ref: 4390f83cecc820beea75016b9aa74544d9d5d3e4
parent: e9980927f7ddd347981839aec144441aa78258d4
author: gxw <[email protected]>
date: Wed Aug 1 05:42:14 EDT 2018
Add support for loongson platform. Add optimized file codec/common/mips64/deblock_mmi.c and corresponding unit tests for loongson platform. Change-Id: Icfbdd1f5f58d5e4a1abfb6150c7135ee9f227ba2
--- a/build/arch.mk
+++ b/build/arch.mk
@@ -29,3 +29,15 @@
CFLAGS += -DHAVE_NEON_AARCH64
endif
endif
+
+# For loongson (mips64): define HAVE_MMI only when the C++ compiler actually used for the build ($(CXX), not the host g++) reports _MIPS_TUNE "loongson3a"
+ifneq ($(filter mips64, $(ARCH)),)
+ifeq ($(USE_ASM), Yes)
+ASM_ARCH = mips64
+ASMFLAGS += -I$(SRC_PATH)codec/common/mips64/
+LOONGSON3A := $(shell $(CXX) -dM -E - < /dev/null | grep '_MIPS_TUNE ' | cut -f 3 -d " ")
+ifeq ($(LOONGSON3A), "loongson3a")
+CFLAGS += -DHAVE_MMI
+endif
+endif
+endif
--- /dev/null
+++ b/codec/common/inc/asmdefs_mmi.h
@@ -1,0 +1,339 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Loongson Technology Co.,Ltd.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ASMDEFS_MMI_H_
+#define ASMDEFS_MMI_H_
+/* CACHE_LINE_SIZE: a byte count, presumably the target cache line size (TODO confirm); nothing in this header uses it. */
+#define CACHE_LINE_SIZE 32
+/* Pointer-width helpers: 64-bit (d-prefixed) mnemonics and int64_t on an LP64 mips64 build, the 32-bit forms otherwise. mips_reg expands to a <stdint.h> type, so users must include <stdint.h> themselves. */
+#if defined(__mips64) && defined(__LP64__)
+# define mips_reg int64_t
+# define PTR_ADDU "daddu "
+# define PTR_ADDIU "daddiu "
+# define PTR_ADDI "daddi "
+# define PTR_SUBU "dsubu "
+# define PTR_L "ld "
+# define PTR_SRA "dsra "
+# define PTR_SRL "dsrl "
+# define PTR_SLL "dsll "
+# define PTR_MTC1 "dmtc1 "
+# define PTR_LI "dli "
+#else
+# define mips_reg int32_t
+# define PTR_ADDU "addu "
+# define PTR_ADDIU "addiu "
+# define PTR_ADDI "addi "
+# define PTR_SUBU "subu "
+# define PTR_L "lw "
+# define PTR_SRA "sra "
+# define PTR_SRL "srl "
+# define PTR_SLL "sll "
+# define PTR_MTC1 "mtc1 "
+# define PTR_LI "li "
+#endif
+/* Byte interleave of the register pair (f0,f2) with (f4,f6): (f0,f2) receives the interleave of f0 with f4, (f8,f10) the interleave of the original f2 with f6; f4/f6 are not written. */
+#define MMI_XSawp_BH(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhbh "#f2", "#f0", "#f4" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "punpckhbh "#f10", "#f8", "#f6" \n\t" \
+ "punpcklbh "#f8", "#f8", "#f6" \n\t"
+/* Halfword interleave of the register pair (f0,f2) with (f4,f6): (f0,f2) receives the interleave of f0 with f4, (f8,f10) the interleave of the original f2 with f6; f4/f6 are not written. */
+#define MMI_XSawp_HW(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f4" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f4" \n\t" \
+ "punpckhhw "#f10", "#f8", "#f6" \n\t" \
+ "punpcklhw "#f8", "#f8", "#f6" \n\t"
+/* Word interleave of the register pair (f0,f2) with (f4,f6): (f0,f2) receives the interleave of f0 with f4, (f8,f10) the interleave of the original f2 with f6; f4/f6 are not written. */
+#define MMI_XSawp_WD(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f4" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f4" \n\t" \
+ "punpckhwd "#f10", "#f8", "#f6" \n\t" \
+ "punpcklwd "#f8", "#f8", "#f6" \n\t"
+/* 64-bit interleave done with plain moves: afterwards (f0,f2) = (f0, f4) and (f8,f10) = (original f2, f6); f0, f4 and f6 are not written. */
+#define MMI_XSawp_DQ(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f2" \n\t" \
+ "mov.d "#f2", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t"
+/* Per-halfword absolute value: (f0,f2) = max(x, 0 - x) for x in (f4,f6). f8/f10 are scratch and get overwritten (callers in this file sometimes alias them with the outputs or inputs). */
+#define WELS_AbsH(f0, f2, f4, f6, f8, f10) \
+ "xor "#f8", "#f8", "#f8" \n\t" \
+ "psubh "#f10", "#f8", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" \
+ "pmaxsh "#f0", "#f4", "#f8" \n\t" \
+ "pmaxsh "#f2", "#f6", "#f10" \n\t"
+/* Halfword butterfly on register pairs: (f4,f6) = (f4,f6) + (f0,f2) and (f0,f2) = (f0,f2) - original (f4,f6); f8/f10 are scratch and end up holding the original (f4,f6). */
+#define MMI_SumSub(f0, f2, f4, f6, f8, f10) \
+ "mov.d "#f8", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t" \
+ "paddh "#f4", "#f4", "#f0" \n\t" \
+ "paddh "#f6", "#f6", "#f2" \n\t" \
+ "psubh "#f0", "#f0", "#f8" \n\t" \
+ "psubh "#f2", "#f2", "#f10" \n\t"
+/* Loads 8 bytes (unaligned) from [r0] and from [r1], widens both to halfwords by interleaving with f8 (expected to be zero -- TODO confirm at call sites) and leaves [r0] - [r1] in (f0,f2); (f4,f6) keep the widened [r1] bytes. */
+#define MMI_LoadDiff8P(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+/* Chains halfword, word and 64-bit interleave stages over four register pairs using (f16,f18) as the fifth pair; presumably transposes two 4x4 halfword matrices (name-based -- TODO confirm which pairs hold the result). */
+#define MMI_TransTwo4x4H(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ MMI_XSawp_HW(f0, f2, f4, f6, f16, f18) \
+ MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
+ MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
+ MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6) \
+ MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
+/* Chains byte, halfword, word and 64-bit interleave stages over eight register pairs; with no spare FP pair, one pair at a time is parked in the GPRs r0/r1 via dmfc1/dmtc1 (both GPRs are clobbered). Presumably a transpose of two 8x8 byte blocks (name-based -- TODO confirm output register order). */
+#define MMI_TransTwo8x8B(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28, f30, r0, r1) \
+ "dmfc1 "#r0", "#f28" \n\t" \
+ "dmfc1 "#r1", "#f30" \n\t" \
+ MMI_XSawp_BH(f0, f2, f4, f6, f28, f30) \
+ MMI_XSawp_BH(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_BH(f16, f18, f20, f22, f12, f14) \
+ "dmtc1 "#r0", "#f20" \n\t" \
+ "dmtc1 "#r1", "#f22" \n\t" \
+ "dmfc1 "#r0", "#f12" \n\t" \
+ "dmfc1 "#r1", "#f14" \n\t" \
+ MMI_XSawp_BH(f24, f26, f20, f22, f12, f14) \
+ MMI_XSawp_HW(f0, f2, f8, f10, f20, f22) \
+ MMI_XSawp_HW(f28, f30, f4, f6, f8, f10) \
+ MMI_XSawp_HW(f16, f18, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f8" \n\t" \
+ "dmfc1 "#r1", "#f10" \n\t" \
+ MMI_XSawp_HW(f24, f26, f12, f14, f8, f10) \
+ MMI_XSawp_WD(f0, f2, f16, f18, f12, f14) \
+ MMI_XSawp_WD(f20, f22, f4, f6, f16, f18) \
+ MMI_XSawp_WD(f28, f30, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f16" \n\t" \
+ "dmfc1 "#r1", "#f18" \n\t" \
+ MMI_XSawp_WD(f24, f26, f8, f10, f16, f18) \
+ MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10) \
+ MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30) \
+ MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f0" \n\t" \
+ "dmfc1 "#r1", "#f2" \n\t" \
+ MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2) \
+ "dmtc1 "#r0", "#f16" \n\t" \
+ "dmtc1 "#r1", "#f18" \n\t"
+/* Single-register halfword interleave of f0 with f2: the low-half interleave lands in f0, the high-half interleave in f4; f2 is not written. */
+#define MMI_XSwap_HW_SINGLE(f0, f2, f4) \
+ "mov.d "#f4", "#f0" \n\t" \
+ "punpckhhw "#f4", "#f4", "#f2" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f2" \n\t"
+/* Single-register word interleave of f0 with f2: the low-word interleave lands in f0, the high-word interleave in f4; f2 is not written. */
+#define MMI_XSwap_WD_SINGLE(f0, f2, f4) \
+ "mov.d "#f4", "#f0" \n\t" \
+ "punpckhwd "#f4", "#f4", "#f2" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f2" \n\t"
+/* Two halfword interleaves followed by two word interleaves over f0,f2,f4,f6 with f8 as the extra register; presumably a 4x4 halfword transpose (name-based -- TODO confirm which registers hold the result rows). */
+#define MMI_Trans4x4H_SINGLE(f0, f2, f4, f6, f8) \
+ MMI_XSwap_HW_SINGLE(f0, f2, f8) \
+ MMI_XSwap_HW_SINGLE(f4, f6, f2) \
+ MMI_XSwap_WD_SINGLE(f0, f4, f6) \
+ MMI_XSwap_WD_SINGLE(f8, f2, f4)
+/* Halfword butterfly on single registers: f0 = f0 + f2 and f2 = f2 - original f0; f4 is scratch and ends up holding the original f2. */
+#define MMI_SumSub_SINGLE(f0, f2, f4) \
+ "mov.d "#f4", "#f2" \n\t" \
+ "psubh "#f2", "#f2", "#f0" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t"
+/* With s = shift count held in f6 (presumably 1, per the name): f0 = (f0 << s) + f2, f4 = original f0 - (f2 << s); f2 is left shifted by s as a side effect. */
+#define MMI_SumSubMul2_SINGLE(f0, f2, f4, f6) \
+ "mov.d "#f4", "#f0" \n\t" \
+ "psllh "#f0", "#f0", "#f6" \n\t" \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "psllh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f4", "#f4", "#f2" \n\t"
+/* Moves GPR r0 into f0 and replicates its low halfword into every halfword lane of (f0,f2), i.e. 8 copies. */
+// f4 must hold 0x0: it is the pshufh selector, and 0 picks halfword 0 for every lane
+#define MMI_Copy8Times(f0, f2, f4, r0) \
+ "dmtc1 "#r0", "#f0" \n\t" \
+ "pshufh "#f0", "#f0", "#f4" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
+/* Moves GPR r0 into f0, doubles its low byte into a halfword (punpcklbh with itself) and replicates that halfword across (f0,f2), i.e. 16 copies of the byte. */
+// f4 must hold 0x0: it is the pshufh selector, and 0 picks halfword 0 for every lane
+#define MMI_Copy16Times(f0, f2, f4, r0) \
+ "dmtc1 "#r0", "#f0" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f0" \n\t" \
+ "pshufh "#f0", "#f0", "#f4" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
+/* With s = arithmetic shift count held in f6 (presumably 1, per the name): f4 = (f2 >> s) + f0, then f0 = (f0 >> s) - f2; f2 is not written. */
+#define MMI_SumSubDiv2_SINGLE(f0, f2, f4, f6) \
+ "psrah "#f4", "#f2", "#f6" \n\t" \
+ "paddh "#f4", "#f4", "#f0" \n\t" \
+ "psrah "#f0", "#f0", "#f6" \n\t" \
+ "psubh "#f0", "#f0", "#f2" \n\t"
+/* Sequence of three sum/difference butterflies and one shifted butterfly over f0..f8, with f10 as scratch and f12 as the shift count; presumably one 1-D pass of a 4-point inverse transform (name-based -- TODO confirm). */
+#define MMI_IDCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
+ MMI_SumSub_SINGLE(f6, f8, f10) \
+ MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12) \
+ MMI_SumSub_SINGLE(f0, f6, f10) \
+ MMI_SumSub_SINGLE(f4, f8, f10)
+/* Computes ((f0 + f4) >> f8) per halfword, adds (signed saturating) the bytes loaded unaligned from [r1] widened with f6 (expected zero -- TODO confirm), packs to unsigned bytes and stores the low 4 bytes unaligned to [r0]; f0 and f2 are clobbered. */
+#define MMI_StoreDiff4P_SINGLE(f0, f2, f4, f6, r0, r1, f8) \
+ "gsldlc1 "#f2", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r1") \n\t" \
+ "punpcklbh "#f2", "#f2", "#f6" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "paddsh "#f0", "#f0", "#f2" \n\t" \
+ "packushb "#f0", "#f0", "#f2" \n\t" \
+ "gsswlc1 "#f0", 0x3("#r0") \n\t" \
+ "gsswrc1 "#f0", 0x0("#r0") \n\t"
+/* Horizontal sum: adds f2 into f0 per halfword, widens the four halfwords by interleaving with f8 (expected zero -- TODO confirm) and reduces them so the low word of f0 holds the total; f2 is clobbered, f4 and f6 are unused. */
+#define SUMH_HORIZON(f0, f2, f4, f6, f8) \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f0" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t"
+/* Reads 8 bytes (unaligned) from each of eight rows r0, r0+r1, ..., r0+7*r1 and gathers byte column 2 of those rows into f0 and byte column 3 into f2. r0 is advanced by 8*r1, r2 is a scratch GPR, f4/f8/f12/f14 are clobbered, f6 and f10 are unused. */
+#define LOAD_COLUMN(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f8", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f8", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f8", "#f8", "#f4" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f12", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f12", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f12", "#f12", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f8", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f8", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f8", "#f8", "#f4" \n\t" \
+ "punpckhhw "#f14", "#f12", "#f8" \n\t" \
+ "punpcklhw "#f12", "#f12", "#f8" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "punpcklwd "#f0", "#f2", "#f14" \n\t" \
+ "punpckhwd "#f2", "#f2", "#f14" \n\t"
+/* Four-row variant: reads 8 bytes (unaligned) from rows r0, r0+r1, r0+2*r1, r0+3*r1 and leaves byte column 2 of those rows in the low word of f0 and byte column 3 in the high word. r0 is advanced by 4*r1, r2 is a scratch GPR, f2/f4 are clobbered, f6 is unused. */
+#define LOAD_COLUMN_C(f0, f2, f4, f6, r0, r1, r2) \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r2") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f2" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f4", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r2") \n\t" \
+ "punpcklbh "#f4", "#f4", "#f2" \n\t" \
+ "punpckhhw "#f0", "#f0", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t"
+/**
+ * backup register: declares the local buffer __back_temp and spills FPU registers into it with quad-word stores ($f24-$f31 when _MIPS_SIM == _ABI64, otherwise $f20/$f22/$f24/$f26/$f28/$f30). The buffer is forced to 16-byte alignment because gssqc1/gslqc1 access 16 bytes at a time and need an aligned address; a plain double array only guarantees 8. Must be paired with RECOVER_REG in the same scope.
+ */
+#define BACKUP_REG \
+ double __attribute__((aligned(16))) __back_temp[8]; \
+ if (_MIPS_SIM == _ABI64) \
+ __asm__ volatile ( \
+ "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
+ "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
+ "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
+ "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ ); \
+ else \
+ __asm__ volatile ( \
+ "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
+ "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
+ "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ );
+
+/**
+ * recover register: reloads with quad-word loads, from the local buffer __back_temp, the same FPU registers that BACKUP_REG stored ($f24-$f31 when _MIPS_SIM == _ABI64, otherwise $f20/$f22/$f24/$f26/$f28/$f30). It declares nothing itself, so it only works after BACKUP_REG in the same scope.
+ */
+#define RECOVER_REG \
+ if (_MIPS_SIM == _ABI64) \
+ __asm__ volatile ( \
+ "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
+ "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
+ "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
+ "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ ); \
+ else \
+ __asm__ volatile ( \
+ "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
+ "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
+ "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ );
+/* Generic status values; nothing in this header uses them (NOTE(review): very generic names, may collide with other headers). */
+# define OK 1
+# define NOTOK 0
+
+#endif /* ASMDEFS_MMI_H_ */
--- a/codec/common/inc/cpu_core.h
+++ b/codec/common/inc/cpu_core.h
@@ -84,6 +84,9 @@
#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
#define WELS_CPU_NEON 0x000004 /* NEON */
+/* For loongson: CPU feature flag for the Loongson MMI SIMD extension */
+#define WELS_CPU_MMI 0x00000001 /* mmi */
+
/*
* Interfaces for CPU core feature detection as below
*/
--- a/codec/common/inc/deblocking_common.h
+++ b/codec/common/inc/deblocking_common.h
@@ -75,6 +75,22 @@
void DeblockChromaEq4H_AArch64_neon (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void WelsNonZeroCount_AArch64_neon (int8_t* pNonZeroCount);
#endif
+/* Loongson MMI deblocking entry points; only declared when the build defines HAVE_MMI. */
+#if defined(HAVE_MMI)
+void DeblockLumaLt4V_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_mmi (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_mmi (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTC);
+void DeblockChromaEq4H_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_mmi (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+ int8_t* pTC);
+void WelsNonZeroCount_mmi (int8_t* pNonZeroCount);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/common/mips64/deblock_mmi.c
@@ -1,0 +1,2826 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file deblock_mmi.c
+ *
+ * \brief Loongson MMI optimized deblocking functions
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+/* Loongson MMI luma deblocking: reads the 16-byte rows at pPix-3*iStride .. pPix+2*iStride and rewrites the rows pPix-2*iStride .. pPix+iStride, driven by iAlpha, iBeta and pTC[0..3]. NOTE(review): rows are accessed with gslqc1/gssqc1, so pPix and iStride presumably must keep every row 16-byte aligned -- confirm with callers. */
+void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+ int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[512] __attribute__((aligned(32))); /* scratch area for 16-byte vectors spilled by the asm below */
+ BACKUP_REG; /* save the FPU registers listed in BACKUP_REG; RECOVER_REG at the end restores them */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dsll $8, %[iStride], 0x1 \n\t"
+ "daddu $8, $8, %[iStride] \n\t"
+ "dsubu $14, %[pPix], $8 \n\t"
+ /* $14 = pPix - 3*iStride (above); $9 = pPix - 2*iStride (below) */
+ "dsll $8, %[iStride], 0x1 \n\t"
+ "dsubu $9, %[pPix], $8 \n\t"
+ /* next: $13 = pPix - iStride, %[iStride] is overwritten with pPix + iStride, $12 = pPix + 2*iStride */
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "dsubu $13, %[pPix], %[iStride] \n\t"
+ "daddu %[iStride], %[iStride], %[pPix] \n\t"
+ "daddu $12, $8, %[pPix] \n\t"
+ /* low halfword of iAlpha replicated over ($f0,$f2) and parked at tmp+432-112; iBeta likewise at tmp+432-336, after which %[iBeta] is reused as the 0xFFFF mask */
+ "punpcklhw $f0, $f0, $f0 \n\t"
+ "lb $8, 0x0(%[pTC]) \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t"
+ "dmtc1 %[iBeta], $f0 \n\t"
+ "lb %[iAlpha], 0x1(%[pTC]) \n\t"
+ "dli %[iBeta], 0xFFFF \n\t"
+ "punpcklhw $f0, $f0, $f0 \n\t"
+ "and $10, %[iAlpha], %[iBeta] \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
+ "dmtc1 $10, $f4 \n\t"
+ "mov.d $f8, $f4 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "and %[iAlpha], $8, %[iBeta] \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "mov.d $f28, $f20 \n\t"
+ "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ /* pTC[0..3] are loaded sign-extended with lb, masked to 16 bits and combined into halfword vectors with punpck*hw; %[pTC] itself is overwritten here */
+ "lb %[iAlpha], 0x3(%[pTC]) \n\t"
+ "lb %[pTC], 0x2(%[pTC]) \n\t"
+ "dmtc1 $10, $f12 \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "and $8, %[iAlpha], %[iBeta] \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
+ "dmtc1 $8, $f0 \n\t"
+ "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
+ "mov.d $f8, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "and %[iAlpha], %[pTC], %[iBeta] \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "punpcklhw $f20, $f20, $f0 \n\t"
+
+ "xor $f0, $f0, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f24 \n\t"
+ "and %[pTC], %[pTC], %[iBeta] \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 %[pTC], $f4 \n\t"
+
+ "gslqc1 $f10, $f8, 0x0($9) \n\t"
+ "punpckhbh $f10, $f8, $f0 \n\t"
+ "punpcklbh $f8, $f8, $f0 \n\t"
+
+ "dli %[iAlpha], 0x4 \n\t"
+ "seh %[pTC], %[iAlpha] \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
+ "gslqc1 $f14, $f12, 0x0($13) \n\t"
+ "gsldxc1 $f2, 0x0($12, $0) \n\t"
+ "punpckhbh $f22, $f20, $f0 \n\t"
+ "punpcklbh $f20, $f20, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
+ "punpckhbh $f22, $f2, $f0 \n\t"
+ "punpcklbh $f20, $f2, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
+ "punpcklhw $f4, $f4, $f16 \n\t"
+ "gslqc1 $f18, $f16, 0x0($14) \n\t"
+ "punpcklhw $f4, $f4, $f24 \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
+ "punpckhhw $f6, $f4, $f28 \n\t"
+ "punpcklhw $f4, $f4, $f28 \n\t"
+ "punpckhbh $f26, $f24, $f0 \n\t"
+ "punpcklbh $f24, $f24, $f0 \n\t"
+ "punpckhbh $f14, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f12, $f0 \n\t"
+ "punpckhbh $f18, $f16, $f0 \n\t"
+ "punpcklbh $f16, $f16, $f0 \n\t"
+ "psubh $f28, $f12, $f16 \n\t"
+ "psubh $f30, $f14, $f18 \n\t"
+ "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
+ "gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+ "psubh $f28, $f24, $f0 \n\t"
+ "psubh $f30, $f26, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
+ "pavgh $f20, $f12, $f24 \n\t"
+ "pavgh $f22, $f14, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
+ "psubh $f20, $f24, $f12 \n\t"
+ "psubh $f22, $f26, $f14 \n\t"
+ "gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f0 \n\t"
+ "psubh $f26, $f26, $f2 \n\t"
+ "gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
+ "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
+ "pcmpgth $f20, $f20, $f28 \n\t"
+ "pcmpgth $f22, $f22, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+
+ "xor $f0, $f0, $f0 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "pcmpgth $f28, $f24, $f0 \n\t"
+ "pcmpgth $f30, $f26, $f0 \n\t"
+ "pcmpeqh $f24, $f24, $f0 \n\t"
+ "pcmpeqh $f26, $f26, $f0 \n\t"
+ "or $f28, $f28, $f24 \n\t"
+ "or $f30, $f30, $f26 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
+ "dmtc1 %[pTC], $f20 \n\t"
+ "punpckhhw $f26, $f20, $f20 \n\t"
+ "punpcklhw $f24, $f20, $f20 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "mov.d $f22, $f20 \n\t"
+ "gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "psubh $f24, $f0, $f20 \n\t"
+ "dli $11, 0x2 \n\t"
+ "psubh $f26, $f0, $f22 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "psubh $f28, $f8, $f0 \n\t"
+ "psubh $f30, $f10, $f2 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "dli $11, 0x3 \n\t"
+ "dmtc1 $11, $f20 \n\t"
+ "psrah $f28, $f28, $f20 \n\t"
+ "psrah $f30, $f30, $f20 \n\t"
+ "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "pmaxsh $f24, $f24, $f28 \n\t"
+ "pmaxsh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+
+ "and $f20, $f20, $f0 \n\t"
+ "and $f22, $f22, $f2 \n\t"
+ "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t"
+ "psubh $f20, $f0, $f24 \n\t"
+ "psubh $f22, $f0, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "mov.d $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f28, $f8, $f8 \n\t"
+ "paddh $f30, $f10, $f10 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "dli $11, 0x1 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "pmaxsh $f24, $f24, $f20 \n\t"
+ "pmaxsh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f28, $f24, $f24 \n\t"
+ "paddh $f30, $f26, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x1 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t"
+ "pmaxsh $f24, $f24, $f20 \n\t"
+ "pmaxsh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x0($9) \n\t"
+ "punpcklbh $f28, $f30, $f0 \n\t"
+ "punpckhbh $f30, $f30, $f0 \n\t"
+ "gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
+
+ "gslqc1 $f30, $f28, 0x0($12) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x0($14) \n\t"
+ "gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0($13) \n\t"
+ "punpcklbh $f28, $f30, $f0 \n\t"
+ "punpckhbh $f30, $f30, $f0 \n\t"
+ "punpcklbh $f20, $f22, $f0 \n\t"
+ "punpckhbh $f22, $f22, $f0 \n\t"
+ "gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+
+ "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+
+ "psubh $f28, $f24, $f28 \n\t"
+ "psubh $f30, $f26, $f30 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t"
+ "pavgh $f20, $f20, $f24 \n\t"
+ "pavgh $f22, $f22, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t"
+ "psubh $f20, $f4, $f20 \n\t"
+ "psubh $f22, $f6, $f22 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
+ "psubh $f20, $f24, $f20 \n\t"
+ "psubh $f22, $f26, $f22 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "mov.d $f28, $f20 \n\t"
+ "mov.d $f30, $f22 \n\t"
+ WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
+ "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
+ "pcmpgth $f20, $f20, $f28 \n\t"
+ "pcmpgth $f22, $f22, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f24 \n\t"
+ "psubh $f30, $f30, $f26 \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f0 \n\t"
+ "psubh $f26, $f26, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "pcmpgth $f16, $f4, $f0 \n\t"
+ "pcmpgth $f18, $f6, $f0 \n\t"
+ "pcmpeqh $f28, $f4, $f0 \n\t"
+ "pcmpeqh $f30, $f6, $f0 \n\t"
+ "or $f16, $f16, $f28 \n\t"
+ "or $f18, $f18, $f30 \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "dli $11, 0x2 \n\t"
+ "psubh $f28, $f0, $f16 \n\t"
+ "psubh $f30, $f0, $f18 \n\t"
+ "psubh $f2, $f0, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "dmfc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x3 \n\t"
+ "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmtc1 $11, $f0 \n\t"
+ "psrah $f24, $f24, $f0 \n\t"
+ "psrah $f26, $f26, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "pmaxsh $f28, $f28, $f24 \n\t"
+ "pmaxsh $f30, $f30, $f26 \n\t"
+ "pminsh $f16, $f16, $f28 \n\t"
+ "pminsh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "mov.d $f24, $f0 \n\t"
+ "mov.d $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x1 \n\t"
+ "paddh $f16, $f16, $f16 \n\t"
+ "paddh $f18, $f18, $f18 \n\t"
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+
+ "dmtc1 $11, $f28 \n\t"
+ "gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t"
+ "psrah $f0, $f0, $f28 \n\t"
+ "psrah $f2, $f2, $f28 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t"
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
+ "pminsh $f28, $f4, $f24 \n\t"
+ "pminsh $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+ "dmfc1 %[iAlpha], $f24 \n\t"
+ "dmfc1 %[iBeta], $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f20, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f0, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t"
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t"
+ "psubh $f16, $f16, $f20 \n\t"
+ "gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t"
+ "psubh $f18, $f18, $f22 \n\t"
+
+ "gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f24 \n\t"
+ "paddh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "packushb $f2, $f16, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "mov.d $f28, $f0 \n\t"
+ "mov.d $f30, $f2 \n\t"
+ "paddh $f0, $f0, $f0 \n\t"
+ "paddh $f2, $f2, $f2 \n\t"
+
+ "dmtc1 %[iAlpha], $f24 \n\t"
+ "dmtc1 %[iBeta], $f26 \n\t"
+
+ "psubh $f16, $f16, $f0 \n\t"
+ "psubh $f18, $f18, $f2 \n\t"
+ "dli $11, 0x1 \n\t"
+ "gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "gssqc1 $f10, $f8, 0x0($9) \n\t"
+ "dmtc1 $11, $f8 \n\t"
+ "psrah $f16, $f16, $f8 \n\t"
+ "psrah $f18, $f18, $f8 \n\t"
+ "pmaxsh $f0, $f0, $f16 \n\t"
+ "pmaxsh $f2, $f2, $f18 \n\t"
+ "pminsh $f4, $f4, $f0 \n\t"
+ "pminsh $f6, $f6, $f2 \n\t"
+ "gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
+ /* rows written back: pPix - 2*iStride via $9 (above), then pPix - iStride via $13, pPix, and pPix + iStride via %[iStride] */
+ "gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "and $f4, $f4, $f8 \n\t"
+ "and $f6, $f6, $f10 \n\t"
+ "gssqc1 $f14, $f12, 0x0($13) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "packushb $f20, $f20, $f22 \n\t"
+ "packushb $f22, $f28, $f30 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
+ "gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
+ : [pPix]"+&r"((unsigned char *)pPix), [iStride]"+&r"(iStride), /* the asm overwrites iStride, iAlpha, iBeta and pTC, */
+ [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta), [pTC]"+&r"(pTC) /* so they must be read-write (early-clobber) operands, not inputs */
+ : [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
+ uint8_t *pDst) { /*
+ * Gathers sixteen 8-byte rows that start at pPixY and are iStride bytes
+ * apart, runs them through MMI_TransTwo8x8B, and writes the result as
+ * eight 16-byte vectors (128 bytes) to pDst + 0x00 .. pDst + 0x70.
+ *
+ * pPixY   first source row; only 8 bytes per row are read. The operand is
+ *         read-write ("+&r") because the asm advances it while loading.
+ * iStride distance in bytes between consecutive source rows.
+ * pDst    destination of the 128 output bytes. It is written with gssqc1
+ *         (128-bit store) -- NOTE(review): presumably must be 16-byte
+ *         aligned; confirm against the Loongson ISA manual and callers.
+ *
+ * Register layout before the macro: rows 0..7 are in $f0,$f4,...,$f28 and
+ * rows 8..15 are in $f2,$f6,...,$f30.
+ * NOTE(review): MMI_TransTwo8x8B is defined outside this block; from its
+ * name it presumably transposes two 8x8 byte tiles -- confirm in the header.
+ */
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "dsll $8, %[iStride], 0x3 \n\t" /* $8 = 8 * iStride */
+ "daddu $8, $8, %[pPixY] \n\t" /* $8 -> row 8 */
+
+ "daddu $9, %[pPixY], %[iStride] \n\t" /* $9 -> row 1 */
+ "daddu $10, $8, %[iStride] \n\t" /* $10 -> row 9 */
+ "gsldlc1 $f0, 0x7(%[pPixY]) \n\t" /* gsldlc1 + gsldrc1 on the same register form one 8-byte load of a row */
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldlc1 $f4, 0x7($9) \n\t"
+ "gsldlc1 $f6, 0x7($10) \n\t"
+ "gsldrc1 $f0, 0x0(%[pPixY]) \n\t" /* $f0 = row 0 */
+ "gsldrc1 $f2, 0x0($8) \n\t" /* $f2 = row 8 */
+ "gsldrc1 $f4, 0x0($9) \n\t" /* $f4 = row 1 */
+ "gsldrc1 $f6, 0x0($10) \n\t" /* $f6 = row 9 */
+ "daddu %[pPixY], $9, %[iStride] \n\t" /* step both row cursors down by two rows: pPixY -> row 2 */
+ "daddu $8, $10, %[iStride] \n\t" /* $8 -> row 10 */
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7($10) \n\t"
+ "gsldrc1 $f8, 0x0(%[pPixY]) \n\t" /* $f8 = row 2 */
+ "gsldrc1 $f10, 0x0($8) \n\t" /* $f10 = row 10 */
+ "gsldrc1 $f12, 0x0($9) \n\t" /* $f12 = row 3 */
+ "gsldrc1 $f14, 0x0($10) \n\t" /* $f14 = row 11 */
+
+ "daddu %[pPixY], $9, %[iStride] \n\t" /* pPixY -> row 4 */
+ "daddu $8, $10, %[iStride] \n\t" /* $8 -> row 12 */
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f18, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($9) \n\t"
+ "gsldlc1 $f22, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixY]) \n\t" /* $f16 = row 4 */
+ "gsldrc1 $f18, 0x0($8) \n\t" /* $f18 = row 12 */
+ "gsldrc1 $f20, 0x0($9) \n\t" /* $f20 = row 5 */
+ "gsldrc1 $f22, 0x0($10) \n\t" /* $f22 = row 13 */
+ "daddu %[pPixY], $9, %[iStride] \n\t" /* pPixY -> row 6 */
+ "daddu $8, $10, %[iStride] \n\t" /* $8 -> row 14 */
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f26, 0x7($8) \n\t"
+
+ "gsldlc1 $f28, 0x7($9) \n\t"
+ "gsldlc1 $f30, 0x7($10) \n\t"
+ "gsldrc1 $f24, 0x0(%[pPixY]) \n\t" /* $f24 = row 6 */
+ "gsldrc1 $f26, 0x0($8) \n\t" /* $f26 = row 14 */
+ "gsldrc1 $f28, 0x0($9) \n\t" /* $f28 = row 7 */
+ "gsldrc1 $f30, 0x0($10) \n\t" /* $f30 = row 15 */
+
+ MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $9, $10) /* $9/$10 are no longer needed as row pointers; presumably used as scratch by the macro */
+
+ "gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t" /* store the eight result vectors, 16 bytes each, in the register order the macro leaves them */
+ "gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t"
+ "gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t"
+ "gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t"
+ "gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t"
+ "gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t"
+ : [pPixY] "+&r"((unsigned char *)pPixY)
+ : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+ "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
+ uint8_t *pSrc) { /*
+ * Reads eight 16-byte vectors (128 bytes) from pSrc + 0x00 .. pSrc + 0x70,
+ * runs them through MMI_TransTwo8x8B, and scatters the result as sixteen
+ * 8-byte rows starting at pPixY, iStride bytes apart.
+ *
+ * pPixY   first destination row; 8 bytes per row are written. The operand
+ *         is read-write ("+&r") because the asm advances it while storing.
+ * iStride distance in bytes between consecutive destination rows.
+ * pSrc    source of the 128 input bytes. It is read with gslqc1 (128-bit
+ *         load) -- NOTE(review): presumably must be 16-byte aligned;
+ *         confirm against the Loongson ISA manual and callers.
+ *
+ * Rows 0..15 are written from $f16,$f8,$f12,$f28,$f20,$f4,$f24,$f0,
+ * $f18,$f10,$f14,$f30,$f22,$f6,$f26,$f2 respectively.
+ * NOTE(review): MMI_TransTwo8x8B is defined outside this block; from its
+ * name it presumably transposes two 8x8 byte tiles -- confirm in the header.
+ */
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t" /* load the eight 16-byte source vectors into register pairs */
+ "gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
+ "gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
+ "gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
+ "gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
+ "gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
+ "gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
+
+ MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $9, $10) /* $9/$10 are listed as clobbers; presumably scratch for the macro */
+
+ "daddu $8, %[pPixY], %[iStride] \n\t" /* $8 -> row 1; pPixY and $8 then leapfrog two rows at a time */
+ "gssdlc1 $f16, 0x7(%[pPixY]) \n\t" /* gssdlc1 + gssdrc1 on the same register form one 8-byte store of a row */
+ "gssdlc1 $f8, 0x7($8) \n\t"
+ "gssdrc1 $f16, 0x0(%[pPixY]) \n\t" /* row 0 */
+ "gssdrc1 $f8, 0x0($8) \n\t" /* row 1 */
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f28, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pPixY]) \n\t" /* row 2 */
+ "gssdrc1 $f28, 0x0($8) \n\t" /* row 3 */
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f4, 0x7($8) \n\t"
+ "gssdrc1 $f20, 0x0(%[pPixY]) \n\t" /* row 4 */
+ "gssdrc1 $f4, 0x0($8) \n\t" /* row 5 */
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f0, 0x7($8) \n\t"
+ "gssdrc1 $f24, 0x0(%[pPixY]) \n\t" /* row 6 */
+ "gssdrc1 $f0, 0x0($8) \n\t" /* row 7 */
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f18, 0x0(%[pPixY]) \n\t" /* row 8 */
+ "gssdrc1 $f10, 0x0($8) \n\t" /* row 9 */
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f30, 0x7($8) \n\t"
+ "gssdrc1 $f14, 0x0(%[pPixY]) \n\t" /* row 10 */
+ "gssdrc1 $f30, 0x0($8) \n\t" /* row 11 */
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f22, 0x0(%[pPixY]) \n\t" /* row 12 */
+ "gssdrc1 $f6, 0x0($8) \n\t" /* row 13 */
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f26, 0x0(%[pPixY]) \n\t" /* row 14 */
+ "gssdrc1 $f2, 0x0($8) \n\t" /* row 15 */
+ : [pPixY] "+&r"((unsigned char *)pPixY)
+ : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+ "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*
+ * Loongson MMI luma deblocking kernel operating on sixteen pixels of an edge.
+ * NOTE(review): the name suggests the H.264 strong (bS == 4) luma filter
+ * applied across a horizontal edge -- confirm against the C reference.
+ *
+ * What the code visibly does:
+ *   - loads eight 16-byte rows, pPix - 4*iStride .. pPix + 3*iStride;
+ *   - broadcasts iAlpha and iBeta to 16-bit lanes, also forms
+ *     (iAlpha >> 2) + 2, and builds per-lane masks with pcmpgth on
+ *     absolute differences of the rows;
+ *   - blends filtered and unfiltered values under those masks and writes
+ *     back six rows, pPix - 3*iStride .. pPix + 2*iStride (16 bytes each).
+ *
+ * pPix    address of the first row below the edge. Rows are accessed with
+ *         gslqc1/gssqc1 (128-bit) -- NOTE(review): presumably pPix and
+ *         iStride must keep every row 16-byte aligned; confirm.
+ * iStride distance in bytes between consecutive rows.
+ * iAlpha  threshold compared against |row(-1) - row(0)| differences.
+ * iBeta   threshold compared against the neighbouring-row differences.
+ *
+ * tmp is a 32-byte-aligned stack area used to spill intermediate vectors;
+ * all offsets used lie inside its 720 bytes.
+ *
+ * The asm overwrites the registers holding iStride, iAlpha and iBeta (the
+ * stride register is recycled as the pPix + 2*iStride store address, the
+ * others as shift-count/constant staging). GCC forbids modifying input-only
+ * operands, so all three are passed as early-clobber read-write operands.
+ */
+void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+ int32_t iBeta) {
+ unsigned char tmp[720] __attribute__((aligned(32)));
+ /* Register-wide scratch copy: holds the stride on entry and an address
+  * (pPix + 2*iStride) once the asm has recycled it. */
+ long iStrideReg = iStride;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Row pointers: $11 = pPix-3s, $8 = pPix-2s, $9 = pPix-s, $13/$14 = pPix+s,
+  * $15 = pPix+2s, $12 = pPix+3s (s = iStride). */
+ "dsll $11, %[iStride], 0x2 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "daddu $14, %[iStride], %[pPix] \n\t"
+ "dsubu $8, %[pPix], $11 \n\t"
+ "gslqc1 $f14, $f12, 0x0($8) \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
+ "daddu $9, %[iStride], %[iStride] \n\t"
+ "daddu $10, $9, %[iStride] \n\t"
+ "move $12, $9 \n\t"
+ "dsubu $8, %[pPix], $9 \n\t"
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "dsubu $9, %[pPix], %[iStride] \n\t"
+ "gslqc1 $f18, $f16, 0x0($9) \n\t"
+ "daddu $13, %[iStride], %[pPix] \n\t"
+
+ /* From here on %[iStride] no longer holds the stride. */
+ "move %[iStride], $12 \n\t"
+ "daddu $15, $12, %[pPix] \n\t"
+
+ "daddu $12, %[pPix], $10 \n\t"
+ "dsubu $11, %[pPix], $10 \n\t"
+
+ "gslqc1 $f26, $f24, 0x0($11) \n\t"
+ "daddu %[iStride], %[iStride], %[pPix] \n\t"
+ /* Broadcast iAlpha, then iBeta, to every 16-bit lane and spill them. */
+ "dmtc1 %[iAlpha], $f0 \n\t"
+
+ "punpcklhw $f28, $f0, $f0 \n\t"
+ "punpcklwd $f0, $f28, $f28 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
+ "dmtc1 %[iBeta], $f0 \n\t"
+ "gsldxc1 $f10, 0x0($15, $0) \n\t"
+ "punpcklhw $f28, $f0, $f0 \n\t"
+ "punpcklwd $f0, $f28, $f28 \n\t"
+ "punpckhbh $f30, $f10, $f8 \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "punpcklbh $f28, $f10, $f8 \n\t"
+ "gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "mov.d $f0, $f4 \n\t"
+ "gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
+ "mov.d $f4, $f16 \n\t"
+ "punpckhbh $f22, $f20, $f8 \n\t"
+ "punpcklbh $f20, $f20, $f8 \n\t"
+ "punpckhbh $f6, $f4, $f8 \n\t"
+ "punpcklbh $f4, $f4, $f8 \n\t"
+
+ "psubh $f28, $f20, $f4 \n\t"
+ "psubh $f30, $f22, $f6 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
+ "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "punpckhbh $f2, $f0, $f8 \n\t"
+ "punpcklbh $f0, $f0, $f8 \n\t"
+ "gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x0($14) \n\t"
+ "gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
+
+ "psubh $f28, $f4, $f0 \n\t"
+ "psubh $f30, $f6, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
+ "punpckhbh $f18, $f16, $f8 \n\t"
+ "punpcklbh $f16, $f16, $f8 \n\t"
+ "pcmpgth $f0, $f0, $f28 \n\t"
+ "pcmpgth $f2, $f2, $f30 \n\t"
+ "gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t"
+ "psubh $f28, $f20, $f16 \n\t"
+ "psubh $f30, $f22, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
+ "punpckhbh $f26, $f24, $f8 \n\t"
+ "punpcklbh $f24, $f24, $f8 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ /* %[iAlpha]/%[iBeta] are reused below purely to stage small constants. */
+ "dli %[iAlpha], 0x2 \n\t"
+ "dli %[iBeta], 0x2 \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "dmtc1 %[iBeta], $f10 \n\t"
+ "gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
+
+ "punpcklhw $f28, $f16, $f16 \n\t"
+ "psrah $f16, $f0, $f10 \n\t"
+ "psrah $f18, $f2, $f10 \n\t"
+ "punpcklwd $f28, $f28, $f28 \n\t"
+ "mov.d $f30, $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f8 \n\t"
+ "pcmpgth $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+
+ "gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f4, $f24 \n\t"
+ "psubh $f30, $f6, $f26 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t"
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f20, $f0 \n\t"
+ "psubh $f30, $f22, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+
+ "gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "pandn $f16, $f16, $f24 \n\t"
+ "dli %[iAlpha], 0x4 \n\t"
+ "pandn $f18, $f18, $f26 \n\t"
+ "gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "punpcklhw $f28, $f16, $f16 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "punpckhbh $f18, $f12, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f30 \n\t"
+ "punpcklbh $f16, $f12, $f8 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "paddh $f16, $f16, $f0 \n\t"
+ "paddh $f18, $f18, $f2 \n\t"
+
+ "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "punpcklwd $f28, $f28, $f28 \n\t"
+ "mov.d $f30, $f28 \n\t"
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t"
+ "pandn $f24, $f24, $f28 \n\t"
+ "pandn $f26, $f26, $f30 \n\t"
+ "gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0($12) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "punpckhbh $f26, $f24, $f8 \n\t"
+ "punpcklbh $f24, $f24, $f8 \n\t"
+ "psllh $f24, $f24, $f10 \n\t"
+ "psllh $f26, $f26, $f10 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+ "gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "pandn $f24, $f24, $f28 \n\t"
+ "pandn $f26, $f26, $f30 \n\t"
+ "gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "psrah $f16, $f16, $f10 \n\t"
+ "psrah $f18, $f18, $f10 \n\t"
+ "and $f16, $f16, $f0 \n\t"
+ "and $f18, $f18, $f2 \n\t"
+ "gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f4, $f20 \n\t"
+ "paddh $f30, $f6, $f22 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f20, $f20, $f4 \n\t"
+ "paddh $f22, $f22, $f6 \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f24 \n\t"
+ "pandn $f30, $f30, $f26 \n\t"
+ "gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f24 \n\t"
+ "paddh $f30, $f30, $f26 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t"
+ "psrah $f28, $f28, $f10 \n\t"
+ "psrah $f30, $f30, $f10 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f20, $f20 \n\t"
+ "paddh $f30, $f22, $f22 \n\t"
+ "paddh $f20, $f4, $f24 \n\t"
+ "paddh $f22, $f6, $f26 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t"
+ "psrah $f28, $f28, $f10 \n\t"
+ "psrah $f30, $f30, $f10 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "pandn $f20, $f20, $f28 \n\t"
+ "pandn $f22, $f22, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t"
+ "psllh $f28, $f28, $f10 \n\t"
+ "psllh $f30, $f30, $f10 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "paddh $f28, $f28, $f24 \n\t"
+ "paddh $f30, $f30, $f26 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+
+ "dli %[iAlpha], 0x2 \n\t"
+ "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f10 \n\t"
+ "psrah $f22, $f22, $f10 \n\t"
+ "and $f4, $f4, $f20 \n\t"
+ "and $f6, $f6, $f22 \n\t"
+ "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t"
+ "paddh $f24, $f4, $f4 \n\t"
+ "paddh $f26, $f6, $f6 \n\t"
+ "paddh $f4, $f4, $f8 \n\t"
+ "paddh $f6, $f6, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f4, $f4, $f8 \n\t"
+ "paddh $f6, $f6, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "psrah $f24, $f24, $f8 \n\t"
+ "psrah $f26, $f26, $f8 \n\t"
+ "psllh $f4, $f4, $f10 \n\t"
+ "psllh $f6, $f6, $f10 \n\t"
+ "paddh $f4, $f4, $f20 \n\t"
+ "paddh $f6, $f6, $f22 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+
+ "gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f24 \n\t"
+ "pandn $f30, $f30, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+ "gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
+ "punpcklbh $f4, $f6, $f8 \n\t"
+ "punpckhbh $f6, $f6, $f8 \n\t"
+ "gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t"
+ "punpcklbh $f4, $f6, $f8 \n\t"
+ "punpckhbh $f6, $f6, $f8 \n\t"
+ "punpcklbh $f24, $f26, $f8 \n\t"
+ "punpckhbh $f26, $f26, $f8 \n\t"
+ "gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
+ "punpcklbh $f20, $f22, $f8 \n\t"
+ "punpckhbh $f22, $f22, $f8 \n\t"
+ "gslqc1 $f30, $f28, 0x0($14) \n\t"
+ "gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 640-432(%[tmp]) \n\t"
+
+ "gsldxc1 $f0, 0x8($15, $0) \n\t"
+ "punpcklbh $f28, $f30, $f8 \n\t"
+ "punpckhbh $f30, $f30, $f8 \n\t"
+ "gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
+
+ "punpcklbh $f28, $f0, $f8 \n\t"
+ "punpckhbh $f30, $f0, $f8 \n\t"
+ "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
+
+ "psubh $f28, $f24, $f4 \n\t"
+ "psubh $f30, $f26, $f6 \n\t"
+ "psubh $f24, $f24, $f8 \n\t"
+ "psubh $f26, $f26, $f10 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
+ "psubh $f28, $f4, $f28 \n\t"
+ "psubh $f30, $f6, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f4, $f0, $f28 \n\t"
+ "pcmpgth $f6, $f2, $f30 \n\t"
+ "pcmpgth $f28, $f0, $f24 \n\t"
+ "pcmpgth $f30, $f2, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t"
+ "and $f4, $f4, $f28 \n\t"
+ "and $f6, $f6, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t"
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "punpcklbh $f12, $f14, $f8 \n\t"
+ "punpckhbh $f14, $f14, $f8 \n\t"
+ "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f8 \n\t"
+ "psubh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+
+ "psllh $f12, $f12, $f10 \n\t"
+ "psllh $f14, $f14, $f10 \n\t"
+ "gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
+
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f8 \n\t"
+ "paddh $f14, $f14, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f8 \n\t"
+ "paddh $f14, $f14, $f10 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+ "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
+
+ "gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t"
+ "and $f24, $f0, $f16 \n\t"
+ "and $f26, $f2, $f18 \n\t"
+ "pandn $f16, $f0, $f28 \n\t"
+ "pandn $f18, $f2, $f30 \n\t"
+ "or $f24, $f24, $f16 \n\t"
+ "or $f26, $f26, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f12, $f12, $f28 \n\t"
+ "psrah $f14, $f14, $f28 \n\t"
+ "and $f12, $f12, $f8 \n\t"
+ "and $f14, $f14, $f10 \n\t"
+ "pandn $f8, $f8, $f20 \n\t"
+ "pandn $f10, $f10, $f22 \n\t"
+ "or $f12, $f12, $f8 \n\t"
+ "or $f14, $f14, $f10 \n\t"
+ "and $f28, $f4, $f12 \n\t"
+ "and $f30, $f6, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
+ "or $f12, $f12, $f8 \n\t"
+ "or $f14, $f14, $f10 \n\t"
+ "pandn $f8, $f4, $f20 \n\t"
+ "pandn $f10, $f6, $f22 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+
+ "dli %[iAlpha], 0x2 \n\t"
+ "and $f8, $f0, $f12 \n\t"
+ "and $f10, $f2, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t"
+ "pandn $f12, $f0, $f12 \n\t"
+ "pandn $f14, $f2, $f14 \n\t"
+ "or $f8, $f8, $f12 \n\t"
+ "or $f10, $f10, $f14 \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f28, $f30 \n\t"
+ "gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "paddh $f8, $f20, $f8 \n\t"
+ "paddh $f10, $f22, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f16 \n\t"
+ "paddh $f30, $f30, $f18 \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f8, $f8, $f28 \n\t"
+ "psrah $f10, $f10, $f28 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f8 \n\t"
+ "pandn $f30, $f30, $f10 \n\t"
+ "or $f24, $f24, $f28 \n\t"
+ "or $f26, $f26, $f30 \n\t"
+ "and $f12, $f4, $f24 \n\t"
+ "and $f14, $f6, $f26 \n\t"
+ "pandn $f24, $f4, $f8 \n\t"
+ "pandn $f26, $f6, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "or $f12, $f12, $f24 \n\t"
+ "or $f14, $f14, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f12, $f14 \n\t"
+ "psllh $f8, $f8, $f28 \n\t"
+ "psllh $f10, $f10, $f28 \n\t"
+ "gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t"
+ "or $f24, $f24, $f28 \n\t"
+ "or $f26, $f26, $f30 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+
+ "and $f12, $f0, $f24 \n\t"
+ "and $f14, $f2, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t"
+ "pandn $f24, $f0, $f24 \n\t"
+ "pandn $f26, $f2, $f26 \n\t"
+ "or $f12, $f12, $f24 \n\t"
+ "or $f14, $f14, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gssqc1 $f14, $f12, 640-352(%[tmp]) \n\t"
+ "gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "and $f24, $f24, $f20 \n\t"
+ "and $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f16, $f12, $f12 \n\t"
+ "paddh $f18, $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f8 \n\t"
+ "paddh $f18, $f18, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f16, $f16, $f28 \n\t"
+ "psrah $f18, $f18, $f28 \n\t"
+ "pandn $f8, $f8, $f16 \n\t"
+ "pandn $f10, $f10, $f18 \n\t"
+ "or $f24, $f24, $f8 \n\t"
+ "or $f26, $f26, $f10 \n\t"
+ "and $f28, $f4, $f24 \n\t"
+ "and $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t"
+ "pandn $f8, $f4, $f24 \n\t"
+ "pandn $f10, $f6, $f26 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f28, $f30 \n\t"
+ "gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
+ "or $f8, $f8, $f28 \n\t"
+ "or $f10, $f10, $f30 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+
+ "and $f16, $f0, $f8 \n\t"
+ "and $f18, $f2, $f10 \n\t"
+ "paddh $f20, $f20, $f24 \n\t"
+ "paddh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
+ "pandn $f8, $f0, $f28 \n\t"
+ "pandn $f10, $f2, $f30 \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psrah $f8, $f8, $f28 \n\t"
+ "psrah $f10, $f10, $f28 \n\t"
+ "gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
+ "paddh $f20, $f8, $f8 \n\t"
+ "paddh $f22, $f10, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "pandn $f12, $f12, $f20 \n\t"
+ "pandn $f14, $f14, $f22 \n\t"
+ "or $f16, $f16, $f12 \n\t"
+ "or $f18, $f18, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
+ "or $f12, $f12, $f28 \n\t"
+ "or $f14, $f14, $f30 \n\t"
+ "and $f28, $f4, $f16 \n\t"
+ "and $f30, $f6, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
+ "pandn $f8, $f4, $f16 \n\t"
+ "pandn $f10, $f6, $f18 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f8 \n\t"
+ "paddh $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f28, $f30 \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t"
+
+ "and $f8, $f0, $f12 \n\t"
+ "and $f10, $f2, $f14 \n\t"
+ "gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t"
+ "pandn $f12, $f0, $f28 \n\t"
+ "pandn $f14, $f2, $f30 \n\t"
+ "or $f8, $f8, $f12 \n\t"
+ "or $f10, $f10, $f14 \n\t"
+ "gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
+ "paddh $f12, $f8, $f28 \n\t"
+ "paddh $f14, $f10, $f30 \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f12, $f12, $f28 \n\t"
+ "psrah $f14, $f14, $f28 \n\t"
+ "and $f24, $f24, $f12 \n\t"
+ "and $f26, $f26, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
+ "pandn $f16, $f12, $f20 \n\t"
+ "pandn $f18, $f14, $f22 \n\t"
+ "or $f24, $f24, $f16 \n\t"
+ "or $f26, $f26, $f18 \n\t"
+ "and $f28, $f4, $f24 \n\t"
+ "and $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t"
+ "pandn $f16, $f4, $f20 \n\t"
+ "pandn $f18, $f6, $f22 \n\t"
+ "or $f28, $f28, $f16 \n\t"
+ "or $f30, $f30, $f18 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f28, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t"
+ "or $f28, $f28, $f16 \n\t"
+ "or $f30, $f30, $f18 \n\t"
+ "and $f16, $f0, $f28 \n\t"
+ "and $f18, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "pandn $f0, $f0, $f28 \n\t"
+ "pandn $f2, $f2, $f30 \n\t"
+ "or $f16, $f16, $f0 \n\t"
+ "or $f18, $f18, $f2 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gslqc1 $f2, $f0, 0x0($12) \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "punpcklbh $f0, $f2, $f30 \n\t"
+ "punpckhbh $f2, $f2, $f30 \n\t"
+ "psllh $f0, $f0, $f28 \n\t"
+ "psllh $f2, $f2, $f28 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f0, $f0, $f28 \n\t"
+ "psrah $f2, $f2, $f28 \n\t"
+ "and $f0, $f0, $f12 \n\t"
+ "and $f2, $f2, $f14 \n\t"
+ "pandn $f12, $f12, $f8 \n\t"
+ "pandn $f14, $f14, $f10 \n\t"
+ "or $f0, $f0, $f12 \n\t"
+ "or $f2, $f2, $f14 \n\t"
+ "and $f28, $f4, $f0 \n\t"
+ "and $f30, $f6, $f2 \n\t"
+
+ /* Write-back: rows pPix-3s ($11), pPix-2s ($8), pPix-s ($9), pPix,
+  * pPix+s ($13) and pPix+2s (address now held in %[iStride]). */
+ "gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t"
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+
+ "gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($8) \n\t"
+ "gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t"
+ "gssqc1 $f2, $f0, 0x0($9) \n\t"
+ "gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t"
+
+ "pandn $f4, $f4, $f8 \n\t"
+ "pandn $f6, $f6, $f10 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
+ "or $f28, $f28, $f4 \n\t"
+ "or $f30, $f30, $f6 \n\t"
+ "packushb $f16, $f16, $f18 \n\t"
+ "packushb $f18, $f28, $f30 \n\t"
+ "gssqc1 $f26, $f24, 0x0($13) \n\t"
+ "gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t"
+ /* iStride/iAlpha/iBeta are written by the asm, hence "+&r" outputs. */
+ : [pPix]"+&r"((unsigned char *)pPix), [iStride]"+&r"(iStrideReg),
+ [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
+ : [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+ "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*
+ * DeblockChromaLt4V_mmi - Loongson MMI chroma deblocking across a horizontal
+ * edge, tc-clipped variant. Cb and Cr are processed together: each 128-bit
+ * register pair holds 8 Cb samples in its low half and 8 Cr samples in its
+ * high half.
+ *
+ * pPixCb, pPixCr  point at the first row below the edge (q0); rows at
+ *                 -2*iStride, -iStride, 0 and +iStride are read, rows at
+ *                 -iStride and 0 are written (8 bytes each, per plane).
+ * iStride         byte distance between rows.
+ * iAlpha, iBeta   thresholds, broadcast to every halfword lane.
+ * pTC             four signed bytes; each one is duplicated into two
+ *                 adjacent halfword lanes.
+ *
+ * Per lane the code computes
+ *   delta = clamp(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
+ * and applies p0 += delta, q0 -= delta (with unsigned saturation on pack)
+ * only where |p0-q0| < iAlpha, |p1-p0| < iBeta, |q1-q0| < iBeta and tc > 0.
+ *
+ * Operand note: the asm overwrites %[pTC], %[iAlpha] and %[iBeta] (they are
+ * reused as scratch GPRs and as spill slots for FP registers), so they are
+ * declared as early-clobber read-write operands rather than plain inputs;
+ * GCC does not allow an asm to modify input-only operands. Their C values
+ * are garbage after the asm and must not be used.
+ *
+ * BACKUP_REG / RECOVER_REG and WELS_AbsH come from the MMI helper header;
+ * WELS_AbsH is used here as a per-halfword absolute value whose last two
+ * register arguments are treated as scratch (NOTE(review): confirm against
+ * the macro definition).
+ */
+void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[256] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Fetch pTC[0..3]; %[pTC] is overwritten and becomes a scratch GPR. */
+ "lb $8, 0x2(%[pTC]) \n\t"
+ "lb $9, 0x3(%[pTC]) \n\t"
+ "move $11, $8 \n\t"
+ "lb $8, 0x1(%[pTC]) \n\t"
+ "lb %[pTC], 0x0(%[pTC]) \n\t"
+ "move $12, %[pTC] \n\t"
+ "and %[pTC], $9, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f4 \n\t"
+ "and %[pTC], $9, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f8 \n\t"
+ "move %[pTC], $11 \n\t"
+ "and $9, %[pTC], 0xFFFF \n\t"
+ "and %[pTC], %[pTC], 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f16 \n\t"
+ "and %[pTC], $8, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f20 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "and %[pTC], $8, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f24 \n\t"
+ "move %[pTC], $12 \n\t"
+ "and $9, %[pTC], 0xFFFF \n\t"
+ "and %[pTC], %[pTC], 0xFFFF \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ /* Zero vector parked at tmp+0x40, reloaded into $f4/$f6 below. */
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
+ "dmtc1 $9, $f28 \n\t"
+ "dmtc1 %[pTC], $f0 \n\t"
+ /* %[pTC] = 2*iStride, $9 = pPixCb - 2*iStride. Row loads are interleaved
+ with the remaining tc shuffles. */
+ "daddu %[pTC], %[iStride], %[iStride] \n\t"
+ "dsubu $9, %[pPixCb], %[pTC] \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+ "gsldxc1 $f24, 0x0($9, $0) \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ /* $f0/$f2 = tc as eight halfwords. */
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "dsubu $9, %[pPixCr], %[pTC] \n\t"
+ /* 0 - tc, kept at tmp+0x60 as the lower clamp bound. */
+ "psubh $f8, $f4, $f0 \n\t"
+ "psubh $f10, $f6, $f2 \n\t"
+ "gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f26, $f8 \n\t"
+ /* %[pTC] = pPixCb - iStride, $9 = pPixCr - iStride: the two rows that
+ receive the filtered p0 samples at the end. */
+ "dsubu %[pTC], %[pPixCb], %[iStride] \n\t"
+ "gsldxc1 $f28, 0x0(%[pTC], $0) \n\t"
+ "dsubu $9, %[pPixCr], %[iStride] \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f30, $f8 \n\t"
+ "gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t"
+ "mov.d $f14, $f8 \n\t"
+ "gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t"
+ /* Broadcast iAlpha into $f16/$f18 and iBeta into tmp+0x50. */
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "punpcklhw $f16, $f8, $f8 \n\t"
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f20, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f20, $f20 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ /* Widen the loaded bytes to halfwords (zero in $f4/$f6) and spill the
+ Cr-side rows to tmp; the Cb side stays in registers. */
+ "punpckhbh $f10, $f24, $f4 \n\t"
+ "punpcklbh $f8, $f24, $f4 \n\t"
+ "gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t"
+ "punpcklwd $f16, $f16, $f16 \n\t"
+ "mov.d $f18, $f16 \n\t"
+ "gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t"
+ "mov.d $f8, $f28 \n\t"
+ "mov.d $f10, $f30 \n\t"
+ "punpcklbh $f28, $f30, $f6 \n\t"
+ "punpckhbh $f30, $f30, $f6 \n\t"
+ "punpckhbh $f22, $f20, $f4 \n\t"
+ "punpcklbh $f20, $f20, $f4 \n\t"
+ "gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t"
+ "punpckhbh $f14, $f12, $f4 \n\t"
+ "punpcklbh $f12, $f12, $f4 \n\t"
+ /* Rounding constant 4 broadcast to tmp+0x20 (%[iBeta] reused as scratch). */
+ "dli %[iBeta], 0x4 \n\t"
+ "punpckhbh $f10, $f8, $f4 \n\t"
+ "punpcklbh $f8, $f8, $f4 \n\t"
+ "dmtc1 %[iBeta], $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ /* tc > 0 mask -> tmp+0x40. */
+ "pcmpgth $f24, $f0, $f4 \n\t"
+ "pcmpgth $f26, $f2, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ /* Cb delta: ((q0 - p0) << 2) + (p1 - q1) + 4) >> 3. $f12/$f14 are
+ spilled to %[iAlpha]/%[iBeta] while they hold the shift counts. */
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ "dmfc1 %[iAlpha], $f12 \n\t"
+ "dmfc1 %[iBeta], $f14 \n\t"
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f12 \n\t"
+ "dli $10, 0x3 \n\t"
+ "dmtc1 $10, $f14 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f14 \n\t"
+ "psrah $f26, $f26, $f14 \n\t"
+ "dmtc1 %[iAlpha], $f12 \n\t"
+ "dmtc1 %[iBeta], $f14 \n\t"
+ /* Clamp to [-tc, tc]; result kept at tmp+0x10. */
+ "pmaxsh $f4, $f4, $f24 \n\t"
+ "pmaxsh $f6, $f6, $f26 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
+ "pminsh $f24, $f24, $f4 \n\t"
+ "pminsh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
+ /* Cb filter mask: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|. */
+ "psubh $f4, $f8, $f12 \n\t"
+ "psubh $f6, $f10, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
+ "pcmpgth $f24, $f16, $f4 \n\t"
+ "pcmpgth $f26, $f18, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ /* Spill $f8/$f10 to GPRs; they are scratch for WELS_AbsH until the
+ matching dmtc1 pair further down. */
+ "dmfc1 %[iAlpha], $f8 \n\t"
+ "dmfc1 %[iBeta], $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
+ "pcmpgth $f28, $f28, $f4 \n\t"
+ "pcmpgth $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
+ "and $f24, $f24, $f28 \n\t"
+ "and $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f12 \n\t"
+ "psubh $f22, $f22, $f14 \n\t"
+ WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
+ "pcmpgth $f4, $f4, $f20 \n\t"
+ "pcmpgth $f6, $f6, $f22 \n\t"
+ "gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f8 \n\t"
+ "psubh $f22, $f22, $f10 \n\t"
+ "and $f24, $f24, $f4 \n\t"
+ "and $f26, $f26, $f6 \n\t"
+ "gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ /* Masked Cb delta -> tmp+0x30. */
+ "gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ /* Same delta / clamp / mask sequence for the Cr half. */
+ "gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f8 \n\t"
+ "psllh $f24, $f24, $f8 \n\t"
+ "psllh $f26, $f26, $f8 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dli $10, 0x3 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dmtc1 $10, $f8 \n\t"
+ "gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f8 \n\t"
+ "psrah $f26, $f26, $f8 \n\t"
+ "pmaxsh $f20, $f20, $f24 \n\t"
+ "pmaxsh $f22, $f22, $f26 \n\t"
+ "pminsh $f0, $f0, $f20 \n\t"
+ "pminsh $f2, $f2, $f22 \n\t"
+ "gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t"
+ "psubh $f24, $f4, $f20 \n\t"
+ "psubh $f26, $f6, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "pcmpgth $f16, $f16, $f24 \n\t"
+ "pcmpgth $f18, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ /* Restore the $f8/$f10 values spilled to the GPRs above. */
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "dmtc1 %[iBeta], $f10 \n\t"
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "and $f16, $f16, $f24 \n\t"
+ "and $f18, $f18, $f26 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ /* Apply: p0 += delta, q0 -= delta, pack with unsigned saturation and
+ store 8 bytes to each of the four destination rows. */
+ "gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "paddh $f4, $f4, $f0 \n\t"
+ "paddh $f6, $f6, $f2 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f4, $f6 \n\t"
+ "gssdxc1 $f8, 0x0(%[pTC], $0) \n\t"
+ "psubh $f12, $f12, $f16 \n\t"
+ "psubh $f14, $f14, $f18 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f20, $f22 \n\t"
+ "gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "gssdxc1 $f10, 0x0($9, $0) \n\t"
+ "gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+ [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta), [pTC]"+&r"(pTC)
+ : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*
+ * DeblockChromaEq4V_mmi - Loongson MMI chroma deblocking across a horizontal
+ * edge, unclipped ("Eq4") variant. Cb and Cr are handled together, 8 samples
+ * per plane.
+ *
+ * pPixCb, pPixCr  point at the first row below the edge (q0); rows at
+ *                 -2*iStride, -iStride, 0 and +iStride are read, rows at
+ *                 -iStride and 0 are written (8 bytes each, per plane).
+ * iStride         byte distance between rows.
+ * iAlpha, iBeta   thresholds, broadcast to every halfword lane.
+ *
+ * Where |p0-q0| < iAlpha, |p1-p0| < iBeta and |q1-q0| < iBeta:
+ *   p0' = (2*p1 + p0 + q1 + 2) >> 2
+ *   q0' = (2*q1 + q0 + p1 + 2) >> 2
+ * other lanes keep their original p0/q0 (pandn/and/or select).
+ *
+ * Operand note: %[iAlpha] and %[iBeta] are overwritten inside the asm (FP
+ * register spill slots, and %[iBeta] later holds the constant 2), so they
+ * are declared as early-clobber read-write operands; GCC does not allow an
+ * asm to modify input-only operands. Their C values are garbage afterwards.
+ *
+ * WELS_AbsH is used as a per-halfword absolute value whose last two register
+ * arguments are scratch (NOTE(review): confirm against the macro definition).
+ */
+void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta) {
+ unsigned char tmp[128] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Load the four rows of each plane. $8 and $9 end up holding
+ pPixCb - iStride and pPixCr - iStride for the final stores. */
+ "daddu $8, %[iStride], %[iStride] \n\t"
+ "dsubu $9, %[pPixCb], $8 \n\t"
+ "gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t"
+ "gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t"
+ "gsldxc1 $f4, 0x0($9, $0) \n\t"
+ "dsubu $9, %[pPixCr], $8 \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "dsubu $8, %[pPixCb], %[iStride] \n\t"
+ "gsldxc1 $f8, 0x0($8, $0) \n\t"
+ "dsubu $9, %[pPixCr], %[iStride] \n\t"
+ "gsldxc1 $f12, 0x0($9, $0) \n\t"
+ "mov.d $f10, $f12 \n\t"
+ "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "mov.d $f14, $f16 \n\t"
+ "gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t"
+ "mov.d $f18, $f20 \n\t"
+ /* Broadcast iAlpha into $f20/$f22 and iBeta into $f24/$f26; $f0/$f2 = 0. */
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "punpcklhw $f24, $f20, $f20 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "mov.d $f22, $f20 \n\t"
+ "dmtc1 %[iBeta], $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ /* Widen bytes to halfwords; rows that do not fit in registers are parked
+ in tmp (0x10..0x60). */
+ "mov.d $f28, $f4 \n\t"
+ "punpcklbh $f4, $f6, $f2 \n\t"
+ "punpckhbh $f6, $f6, $f2 \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
+ "punpckhbh $f30, $f8, $f0 \n\t"
+ "punpcklbh $f28, $f8, $f0 \n\t"
+ "gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t"
+ "punpckhbh $f30, $f12, $f0 \n\t"
+ "punpcklbh $f28, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f14, $f2 \n\t"
+ "punpckhbh $f14, $f14, $f2 \n\t"
+ "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "mov.d $f28, $f16 \n\t"
+ "punpcklbh $f16, $f18, $f2 \n\t"
+ "punpckhbh $f18, $f18, $f2 \n\t"
+ "punpcklbh $f8, $f10, $f2 \n\t"
+ "punpckhbh $f10, $f10, $f2 \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t"
+ /* Cb mask in $f0/$f2: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|. */
+ "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t"
+ "psubh $f4, $f12, $f0 \n\t"
+ "psubh $f6, $f14, $f2 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
+ "pcmpgth $f0, $f20, $f4 \n\t"
+ "pcmpgth $f2, $f22, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f12 \n\t"
+ "psubh $f6, $f6, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
+ "psubh $f4, $f28, $f16 \n\t"
+ "psubh $f6, $f30, $f18 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ /* Cr mask in $f20/$f22, same three comparisons. $f28/$f30 are spilled to
+ %[iAlpha]/%[iBeta] while they serve as WELS_AbsH scratch. */
+ "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f4 \n\t"
+ "psubh $f6, $f10, $f6 \n\t"
+ "dmfc1 %[iAlpha], $f28 \n\t"
+ "dmfc1 %[iBeta], $f30 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+ "pcmpgth $f20, $f20, $f4 \n\t"
+ "pcmpgth $f22, $f22, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f28 \n\t"
+ "psubh $f6, $f6, $f30 \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 %[iBeta], $f30 \n\t"
+ "pcmpgth $f24, $f24, $f4 \n\t"
+ "pcmpgth $f26, $f26, $f6 \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ /* %[iBeta] = 2: used both as the rounding constant (broadcast, parked at
+ tmp+0x10) and as the shift count for psrah. */
+ "dli %[iBeta], 0x2 \n\t"
+ "dmtc1 %[iBeta], $f4 \n\t"
+ "punpcklhw $f16, $f4, $f4 \n\t"
+ "punpcklwd $f4, $f16, $f16 \n\t"
+ "mov.d $f6, $f4 \n\t"
+ /* Cb p0' = (2*p1 + p0 + q1 + 2) >> 2, selected by the Cb mask. */
+ "gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t"
+ "paddh $f24, $f16, $f16 \n\t"
+ "paddh $f26, $f18, $f18 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "dmtc1 %[iBeta], $f16 \n\t"
+ "psrah $f24, $f24, $f16 \n\t"
+ "psrah $f26, $f26, $f16 \n\t"
+ "pandn $f16, $f0, $f12 \n\t"
+ "pandn $f18, $f2, $f14 \n\t"
+ "gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t"
+ "and $f4, $f0, $f24 \n\t"
+ "and $f6, $f2, $f26 \n\t"
+ "or $f4, $f4, $f16 \n\t"
+ "or $f6, $f6, $f18 \n\t"
+ /* Cr p0', same formula, selected by the Cr mask. */
+ "paddh $f24, $f12, $f12 \n\t"
+ "paddh $f26, $f14, $f14 \n\t"
+ "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "dmtc1 %[iBeta], $f16 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "psrah $f24, $f24, $f16 \n\t"
+ "psrah $f26, $f26, $f16 \n\t"
+ "and $f16, $f20, $f24 \n\t"
+ "and $f18, $f22, $f26 \n\t"
+ "pandn $f24, $f20, $f8 \n\t"
+ "pandn $f26, $f22, $f10 \n\t"
+ "or $f16, $f16, $f24 \n\t"
+ "or $f18, $f18, $f26 \n\t"
+ "packushb $f4, $f4, $f6 \n\t"
+ "packushb $f6, $f16, $f18 \n\t"
+ /* Cb q0' = (2*q1 + q0 + p1 + 2) >> 2, selected by the Cb mask. */
+ "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
+ "paddh $f24, $f28, $f28 \n\t"
+ "paddh $f26, $f30, $f30 \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dmtc1 %[iBeta], $f28 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "psrah $f24, $f24, $f28 \n\t"
+ "psrah $f26, $f26, $f28 \n\t"
+ "and $f8, $f0, $f24 \n\t"
+ "and $f10, $f2, $f26 \n\t"
+ "pandn $f0, $f0, $f16 \n\t"
+ "pandn $f2, $f2, $f18 \n\t"
+ "or $f8, $f8, $f0 \n\t"
+ "or $f10, $f10, $f2 \n\t"
+ /* Cr q0', same formula, selected by the Cr mask. */
+ "gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f0, $f0 \n\t"
+ "paddh $f26, $f2, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "gssdxc1 $f4, 0x0($8, $0) \n\t"
+ "psrah $f24, $f24, $f28 \n\t"
+ "psrah $f26, $f26, $f28 \n\t"
+ "and $f16, $f20, $f24 \n\t"
+ "and $f18, $f22, $f26 \n\t"
+ "pandn $f20, $f20, $f0 \n\t"
+ "pandn $f22, $f22, $f2 \n\t"
+ "or $f16, $f16, $f20 \n\t"
+ "or $f18, $f18, $f22 \n\t"
+ /* Pack with unsigned saturation and store the remaining three rows. */
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f16, $f18 \n\t"
+ "gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t"
+ "gssdxc1 $f6, 0x0($9, $0) \n\t"
+ "gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+ [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
+ : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*
+ * DeblockChromaEq4H_mmi - Loongson MMI chroma deblocking across a vertical
+ * edge, unclipped ("Eq4") variant. Eight rows of Cb and eight rows of Cr are
+ * processed together.
+ *
+ * pPixCb, pPixCr  point at the first sample right of the edge in row 0.
+ *                 Both pointers are moved back by 2 bytes; 8 bytes are read
+ *                 from each of 8 rows per plane (unaligned loads), of which
+ *                 the low 4 bytes (x-2 .. x+1) are used, and 4 bytes are
+ *                 written back to each of those rows.
+ * iStride         byte distance between rows.
+ * iAlpha, iBeta   thresholds, broadcast to every halfword lane.
+ *
+ * The 4-byte row fragments are transposed into four 16-byte column vectors
+ * stored at tmp+0x80..0xb0, filtered with the same formulas as the vertical
+ * Eq4 routine
+ *   p0' = (2*p1 + p0 + q1 + 2) >> 2,  q0' = (2*q1 + q0 + p1 + 2) >> 2
+ * under the mask |p0-q0| < iAlpha, |p1-p0| < iBeta, |q1-q0| < iBeta, then the
+ * two filtered columns replace tmp+0x90 / tmp+0xa0, everything is transposed
+ * back and stored.
+ *
+ * Operand note: %[iAlpha] and %[iBeta] are overwritten inside the asm (FP
+ * register spill slots; %[iAlpha] later holds the shift count 0x20), so they
+ * are declared as early-clobber read-write operands; GCC does not allow an
+ * asm to modify input-only operands. Their C values are garbage afterwards.
+ *
+ * WELS_AbsH is used as a per-halfword absolute value whose last two register
+ * arguments are scratch (NOTE(review): confirm against the macro definition).
+ */
+void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta) {
+ unsigned char tmp[256] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* $9/$10 = row 0 of Cb/Cr at x-2; %[pPixCb]/%[pPixCr] advance to row 4;
+ $11 = tmp+0x80 (column storage). */
+ "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
+ "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
+ "move $9, %[pPixCb] \n\t"
+ "move $10, %[pPixCr] \n\t"
+ "dsll $11, %[iStride], 0x2 \n\t"
+ "daddu %[pPixCb], %[pPixCb], $11 \n\t"
+ "daddu %[pPixCr], %[pPixCr], $11 \n\t"
+ "daddiu $11, %[tmp], 0x80 \n\t"
+ /* Rows 0..3: unaligned 8-byte loads, then low word of Cb joined with the
+ low word of Cr. */
+ "gsldlc1 $f0, 0x7($9) \n\t"
+ "gsldrc1 $f0, 0x0($9) \n\t"
+ "daddu $12, $9, %[iStride] \n\t"
+ "gsldlc1 $f4, 0x7($12) \n\t"
+ "gsldrc1 $f4, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7($12) \n\t"
+ "gsldrc1 $f8, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f12, 0x7($12) \n\t"
+ "gsldlc1 $f16, 0x7($10) \n\t"
+ "gsldrc1 $f12, 0x0($12) \n\t"
+ "gsldrc1 $f16, 0x0($10) \n\t"
+ "daddu $12, $10, %[iStride] \n\t"
+ "gsldlc1 $f20, 0x7($12) \n\t"
+ "gsldrc1 $f20, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7($12) \n\t"
+ "gsldrc1 $f24, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f28, 0x7($12) \n\t"
+ "gsldrc1 $f28, 0x0($12) \n\t"
+ "punpcklwd $f0, $f0, $f16 \n\t"
+ "punpcklwd $f4, $f4, $f20 \n\t"
+ "punpcklwd $f8, $f8, $f24 \n\t"
+ "punpcklwd $f12, $f12, $f28 \n\t"
+ /* Rows 4..7, same pattern, into $f2/$f6/$f10/$f14. */
+ "gsldlc1 $f16, 0x7(%[pPixCb]) \n\t"
+ "gsldlc1 $f20, 0x7(%[pPixCr]) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixCb]) \n\t"
+ "gsldrc1 $f20, 0x0(%[pPixCr]) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "daddu $12, %[pPixCb], %[iStride] \n\t"
+ "daddu $13, %[pPixCr], %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f6, $f16 \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "daddu $13, $13, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "daddu $13, $13, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f14, $f16 \n\t"
+ /* Transpose (byte, halfword, then word interleaves) and store the four
+ column vectors at $11 = tmp+0x80. */
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+ "gssqc1 $f22, $f20, 0x10($11) \n\t"
+ "gssqc1 $f6, $f4, 0x20($11) \n\t"
+ "gssqc1 $f26, $f24, 0x30($11) \n\t"
+ "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t"
+ /* Broadcast iAlpha into $f4/$f6 and iBeta into $f8/$f10; $f0 = 0. */
+ "xor $f0, $f0, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f4 \n\t"
+ "punpcklhw $f8, $f4, $f4 \n\t"
+ "punpcklwd $f4, $f8, $f8 \n\t"
+ "mov.d $f6, $f4 \n\t"
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f12, $f12 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ /* Widen each column to halfwords. The high (Cr) halves go to
+ tmp+0x30..0x70, the low (Cb) halves stay in registers. */
+ "mov.d $f12, $f24 \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "punpckhbh $f18, $f16, $f0 \n\t"
+ "punpcklbh $f16, $f16, $f0 \n\t"
+ "punpckhbh $f22, $f20, $f0 \n\t"
+ "punpcklbh $f20, $f20, $f0 \n\t"
+ "punpckhbh $f14, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f12, $f0 \n\t"
+ "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ /* Mask for the register-resident half -> $f0/$f2. */
+ "psubh $f24, $f16, $f20 \n\t"
+ "psubh $f26, $f18, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f0, $f4, $f24 \n\t"
+ "pcmpgth $f2, $f6, $f26 \n\t"
+ "psubh $f24, $f12, $f16 \n\t"
+ "psubh $f26, $f14, $f18 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ "and $f0, $f0, $f28 \n\t"
+ "and $f2, $f2, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ /* Spill $f20/$f22 to GPRs; restored by the dmtc1 pair before the q0'
+ computation further down. */
+ "dmfc1 %[iAlpha], $f20 \n\t"
+ "dmfc1 %[iBeta], $f22 \n\t"
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ /* Mask for the half parked in tmp -> $f4/$f6. */
+ "gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "pcmpgth $f4, $f4, $f24 \n\t"
+ "pcmpgth $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "and $f0, $f0, $f28 \n\t"
+ "and $f2, $f2, $f30 \n\t"
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ /* $8 = 2: rounding constant (broadcast, parked at tmp+0x20) and shift
+ count for psrah. */
+ "dli $8, 0x2 \n\t"
+ "and $f4, $f4, $f28 \n\t"
+ "and $f6, $f6, $f30 \n\t"
+ "pcmpgth $f8, $f8, $f24 \n\t"
+ "pcmpgth $f10, $f10, $f26 \n\t"
+ "and $f4, $f4, $f8 \n\t"
+ "and $f6, $f6, $f10 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "punpcklhw $f24, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f24, $f24 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ /* p0' for both halves; packed result replaces the column at tmp+0x90. */
+ "paddh $f8, $f12, $f12 \n\t"
+ "paddh $f10, $f14, $f14 \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f24 \n\t"
+ "paddh $f10, $f10, $f26 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrah $f8, $f8, $f20 \n\t"
+ "psrah $f10, $f10, $f20 \n\t"
+ "and $f24, $f0, $f8 \n\t"
+ "and $f26, $f2, $f10 \n\t"
+ "pandn $f8, $f0, $f16 \n\t"
+ "pandn $f10, $f2, $f18 \n\t"
+ "or $f24, $f24, $f8 \n\t"
+ "or $f26, $f26, $f10 \n\t"
+ "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "paddh $f28, $f8, $f8 \n\t"
+ "paddh $f30, $f10, $f10 \n\t"
+ "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f16 \n\t"
+ "paddh $f30, $f30, $f18 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "pandn $f8, $f4, $f20 \n\t"
+ "pandn $f10, $f6, $f22 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrah $f28, $f28, $f20 \n\t"
+ "psrah $f30, $f30, $f20 \n\t"
+ "and $f16, $f4, $f28 \n\t"
+ "and $f18, $f6, $f30 \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f16, $f18 \n\t"
+ "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ /* q0' for both halves; packed result replaces the column at tmp+0xa0.
+ $f20/$f22 are restored from the GPR spill first. */
+ "paddh $f24, $f8, $f8 \n\t"
+ "paddh $f26, $f10, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "dmtc1 %[iBeta], $f22 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "mov.d $f16, $f0 \n\t"
+ "mov.d $f18, $f2 \n\t"
+ "pandn $f0, $f0, $f20 \n\t"
+ "pandn $f2, $f2, $f22 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "psrah $f24, $f24, $f20 \n\t"
+ "psrah $f26, $f26, $f20 \n\t"
+ "and $f16, $f16, $f24 \n\t"
+ "and $f18, $f18, $f26 \n\t"
+ "or $f16, $f16, $f0 \n\t"
+ "or $f18, $f18, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t"
+ "paddh $f20, $f0, $f0 \n\t"
+ "paddh $f22, $f2, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+ "gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "psrah $f20, $f20, $f8 \n\t"
+ "psrah $f22, $f22, $f8 \n\t"
+ "and $f12, $f4, $f20 \n\t"
+ "and $f14, $f6, $f22 \n\t"
+ "pandn $f4, $f4, $f0 \n\t"
+ "pandn $f6, $f6, $f2 \n\t"
+ "or $f12, $f12, $f4 \n\t"
+ "or $f14, $f14, $f6 \n\t"
+ "packushb $f16, $f16, $f18 \n\t"
+ "packushb $f18, $f12, $f14 \n\t"
+ "gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t"
+ /* Reload all four columns from tmp+0x80 and transpose back to rows. */
+ "gslqc1 $f2, $f0, 0x0($11) \n\t"
+ "gslqc1 $f6, $f4, 0x10($11) \n\t"
+ "gslqc1 $f10, $f8, 0x20($11) \n\t"
+ "gslqc1 $f14, $f12, 0x30($11) \n\t"
+ "mov.d $f26, $f2 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+ "punpcklbh $f28, $f30, $f14 \n\t"
+ "punpckhbh $f30, $f30, $f14 \n\t"
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ /* %[iAlpha] = 0x20: dsrl count that moves the Cr word into the low half. */
+ "dli %[iAlpha], 0x20 \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ /* Rows 0..3: store the low word to Cb, shift, store to Cr. */
+ "gsswlc1 $f0, 0x3($9) \n\t"
+ "gsswrc1 $f0, 0x0($9) \n\t"
+ "daddu $12, $9, %[iStride] \n\t"
+ "gsswlc1 $f20, 0x3($12) \n\t"
+ "gsswrc1 $f20, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($12) \n\t"
+ "gsswrc1 $f4, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f24, 0x3($12) \n\t"
+ "gsswrc1 $f24, 0x0($12) \n\t"
+ "dsrl $f0, $f0, $f8 \n\t"
+ "dsrl $f20, $f20, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+ "dsrl $f24, $f24, $f8 \n\t"
+ "gsswlc1 $f0, 0x3($10) \n\t"
+ "gsswrc1 $f0, 0x0($10) \n\t"
+ "daddu $13, $10, %[iStride] \n\t"
+ "daddu $8, $13, %[iStride] \n\t"
+ "gsswlc1 $f20, 0x3($13) \n\t"
+ "gsswrc1 $f20, 0x0($13) \n\t"
+ "daddu $13, $8, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($8) \n\t"
+ "gsswrc1 $f4, 0x0($8) \n\t"
+ "gsswlc1 $f24, 0x3($13) \n\t"
+ "gsswrc1 $f24, 0x0($13) \n\t"
+ /* Rows 4..7, same pattern, via the advanced %[pPixCb]/%[pPixCr]. */
+ "gsswlc1 $f2, 0x3(%[pPixCb]) \n\t"
+ "gsswrc1 $f2, 0x0(%[pPixCb]) \n\t"
+ "daddu $12, %[pPixCb], %[iStride] \n\t"
+ "gsswlc1 $f22, 0x3($12) \n\t"
+ "gsswrc1 $f22, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($12) \n\t"
+ "gsswrc1 $f6, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f26, 0x3($12) \n\t"
+ "gsswrc1 $f26, 0x0($12) \n\t"
+ "dsrl $f2, $f2, $f8 \n\t"
+ "dsrl $f22, $f22, $f8 \n\t"
+ "dsrl $f6, $f6, $f8 \n\t"
+ "dsrl $f26, $f26, $f8 \n\t"
+ "gsswlc1 $f2, 0x3(%[pPixCr]) \n\t"
+ "gsswrc1 $f2, 0x0(%[pPixCr]) \n\t"
+ "daddu $13, %[pPixCr], %[iStride] \n\t"
+ "daddu $8, $13, %[iStride] \n\t"
+ "gsswlc1 $f22, 0x3($13) \n\t"
+ "gsswrc1 $f22, 0x0($13) \n\t"
+ "daddu $13, $8, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($8) \n\t"
+ "gsswrc1 $f6, 0x0($8) \n\t"
+ "gsswlc1 $f26, 0x3($13) \n\t"
+ "gsswrc1 $f26, 0x0($13) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+ [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
+ : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[320] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
+ "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
+ "daddu $8, %[pPixCb], %[iStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pPixCb]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pPixCb]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+
+ "daddu $10, %[pPixCr], %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixCr]) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixCr]) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7($11) \n\t"
+ "gsldlc1 $f28, 0x7($10) \n\t"
+ "gsldrc1 $f24, 0x0($11) \n\t"
+ "gsldrc1 $f28, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "punpcklwd $f0, $f0, $f16 \n\t"
+ "punpcklwd $f4, $f4, $f20 \n\t"
+ "punpcklwd $f8, $f8, $f24 \n\t"
+ "punpcklwd $f12, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x7($9) \n\t"
+ "gsldlc1 $f20, 0x7($11) \n\t"
+ "gsldrc1 $f16, 0x0($9) \n\t"
+ "gsldrc1 $f20, 0x0($11) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f6, $f16 \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "gsldlc1 $f16, 0x7($9) \n\t"
+ "gsldlc1 $f20, 0x7($11) \n\t"
+ "gsldrc1 $f16, 0x0($9) \n\t"
+ "gsldrc1 $f20, 0x0($11) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+
+ "gsldlc1 $f16, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f14, $f16 \n\t"
+
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "daddiu $11, %[tmp], 0x70 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+ "gssqc1 $f22, $f20, 0x10($11) \n\t"
+ "gssqc1 $f6, $f4, 0x20($11) \n\t"
+ "gssqc1 $f26, $f24, 0x30($11) \n\t"
+
+ "lb $8, 0x3(%[pTC]) \n\t"
+ "lb $9, 0x2(%[pTC]) \n\t"
+ "lb $10, 0x1(%[pTC]) \n\t"
+ "lb $11, 0x0(%[pTC]) \n\t"
+
+ "and $12, $8, 0xFFFF \n\t"
+ "dmtc1 $12, $f8 \n\t"
+
+ "and $9, $9, 0xFFFF \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "mov.d $f16, $f12 \n\t"
+
+ "and $9, $10, 0xFFFF \n\t"
+ "dmtc1 $9, $f20 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "and $9, $11, 0xFFFF \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+
+ "mov.d $f4, $f8 \n\t"
+ "dmtc1 $9, $f28 \n\t"
+ "mov.d $f0, $f28 \n\t"
+
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t"
+ "psubh $f8, $f4, $f0 \n\t"
+ "psubh $f10, $f6, $f2 \n\t"
+ "gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f16, $f12, $f12 \n\t"
+ "mov.d $f18, $f16 \n\t"
+
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f12, $f12 \n\t"
+ "mov.d $f10, $f8 \n\t"
+
+ "gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
+ "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "punpckhbh $f10, $f24, $f4 \n\t"
+ "punpcklbh $f8, $f24, $f4 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+
+ "gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "punpcklbh $f8, $f28, $f4 \n\t"
+ "punpckhbh $f10, $f28, $f4 \n\t"
+ "punpcklbh $f28, $f30, $f6 \n\t"
+ "punpckhbh $f30, $f30, $f6 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "punpckhbh $f14, $f12, $f4 \n\t"
+ "punpcklbh $f12, $f12, $f4 \n\t"
+ "punpckhbh $f22, $f20, $f4 \n\t"
+ "punpcklbh $f20, $f20, $f4 \n\t"
+ "gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+
+ "dli $13, 0x4 \n\t"
+ "gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
+ "dmtc1 $13, $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "dli $12, 0x2 \n\t"
+ "dli $13, 0x3 \n\t"
+
+ "gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmfc1 %[iBeta], $f2 \n\t"
+ "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "pcmpgth $f24, $f0, $f4 \n\t"
+ "pcmpgth $f26, $f2, $f6 \n\t"
+
+ "dmtc1 $12, $f0 \n\t"
+ "dmtc1 $13, $f2 \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ "psllh $f24, $f24, $f0 \n\t"
+ "psllh $f26, $f26, $f0 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "psrah $f24, $f24, $f2 \n\t"
+ "psrah $f26, $f26, $f2 \n\t"
+ "pmaxsh $f4, $f4, $f24 \n\t"
+ "pmaxsh $f6, $f6, $f26 \n\t"
+
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "pminsh $f24, $f24, $f4 \n\t"
+ "pminsh $f26, $f26, $f6 \n\t"
+
+ "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f12 \n\t"
+ "psubh $f6, $f10, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "pcmpgth $f24, $f16, $f4 \n\t"
+ "pcmpgth $f26, $f18, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "pcmpgth $f28, $f28, $f4 \n\t"
+ "pcmpgth $f30, $f30, $f6 \n\t"
+
+ "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
+ "and $f24, $f24, $f28 \n\t"
+ "and $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f12 \n\t"
+ "psubh $f22, $f22, $f14 \n\t"
+ WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
+ "pcmpgth $f4, $f4, $f20 \n\t"
+ "pcmpgth $f6, $f6, $f22 \n\t"
+
+ "gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "and $f24, $f24, $f4 \n\t"
+ "and $f26, $f26, $f6 \n\t"
+ "gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+
+ "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t"
+
+ "dmtc1 $12, $f0 \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ "psllh $f24, $f24, $f0 \n\t"
+ "psllh $f26, $f26, $f0 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "dmtc1 %[iBeta], $f2 \n\t"
+
+ "dmtc1 $13, $f0 \n\t"
+ "gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f0 \n\t"
+ "psrah $f26, $f26, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "pmaxsh $f20, $f20, $f24 \n\t"
+ "pmaxsh $f22, $f22, $f26 \n\t"
+ "pminsh $f0, $f0, $f20 \n\t"
+ "pminsh $f2, $f2, $f22 \n\t"
+
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmfc1 %[iBeta], $f2 \n\t"
+ "gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t"
+ "psubh $f24, $f4, $f20 \n\t"
+ "psubh $f26, $f6, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f16, $f16, $f24 \n\t"
+ "pcmpgth $f18, $f18, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "dmtc1 %[iBeta], $f2 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+
+ "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "paddh $f4, $f4, $f0 \n\t"
+ "paddh $f6, $f6, $f2 \n\t"
+ "psubh $f12, $f12, $f16 \n\t"
+ "psubh $f14, $f14, $f18 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f4, $f6 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f20, $f22 \n\t"
+
+ "gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t"
+ "gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
+ "daddiu $11, %[tmp], 0x70 \n\t"
+
+ "gslqc1 $f2, $f0, 0x0($11) \n\t"
+ "gslqc1 $f6, $f4, 0x10($11) \n\t"
+ "gslqc1 $f10, $f8, 0x20($11) \n\t"
+ "gslqc1 $f14, $f12, 0x30($11) \n\t"
+
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+
+ "dli %[iAlpha], 0x20 \n\t"
+ "daddu $8, %[pPixCb], %[iStride] \n\t"
+ "gsswlc1 $f0, 0x3(%[pPixCb]) \n\t"
+ "gsswlc1 $f20, 0x3($8) \n\t"
+ "gsswrc1 $f0, 0x0(%[pPixCb]) \n\t"
+ "gsswrc1 $f20, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($9) \n\t"
+ "gsswlc1 $f24, 0x3($8) \n\t"
+ "gsswrc1 $f4, 0x0($9) \n\t"
+ "gsswrc1 $f24, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+
+ "dsrl $f0, $f0, $f8 \n\t"
+ "dsrl $f20, $f20, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+ "dsrl $f24, $f24, $f8 \n\t"
+ "daddu $10, %[pPixCr], %[iStride] \n\t"
+ "gsswlc1 $f0, 0x3(%[pPixCr]) \n\t"
+ "gsswlc1 $f20, 0x3($10) \n\t"
+ "gsswrc1 $f0, 0x0(%[pPixCr]) \n\t"
+ "gsswrc1 $f20, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($11) \n\t"
+ "gsswlc1 $f24, 0x3($10) \n\t"
+ "gsswrc1 $f4, 0x0($11) \n\t"
+ "gsswrc1 $f24, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f2, 0x3($9) \n\t"
+ "gsswlc1 $f22, 0x3($8) \n\t"
+ "gsswrc1 $f2, 0x0($9) \n\t"
+ "gsswrc1 $f22, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($9) \n\t"
+ "gsswlc1 $f26, 0x3($8) \n\t"
+ "gsswrc1 $f6, 0x0($9) \n\t"
+ "gsswrc1 $f26, 0x0($8) \n\t"
+
+ "dsrl $f2, $f2, $f8 \n\t"
+ "dsrl $f22, $f22, $f8 \n\t"
+ "dsrl $f6, $f6, $f8 \n\t"
+ "dsrl $f26, $f26, $f8 \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f2, 0x3($11) \n\t"
+ "gsswlc1 $f22, 0x3($10) \n\t"
+ "gsswrc1 $f2, 0x0($11) \n\t"
+ "gsswrc1 $f22, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($11) \n\t"
+ "gsswlc1 $f26, 0x3($10) \n\t"
+ "gsswrc1 $f6, 0x0($11) \n\t"
+ "gsswrc1 $f26, 0x0($10) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) { /* in place: replaces each of the 24 bytes at pNonZeroCount by its unsigned minimum with 1 */
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t" /* gsldlc1/gsldrc1 pairs: unaligned 64-bit loads; $f0 <- bytes 0..7 */
+ "gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t" /* $f2 <- bytes 8..15 */
+ "gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t" /* $f4 <- bytes 16..23 */
+ "gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
+ "pcmpeqh $f8, $f8, $f8 \n\t" /* a register compared with itself: every halfword becomes 0xFFFF */
+ "dli $8, 0xF \n\t" /* shift count 15 */
+ "dmtc1 $8, $f6 \n\t"
+ "psrlh $f8, $f8, $f6 \n\t" /* 0xFFFF >> 15: every halfword becomes 0x0001 */
+ "packushb $f8, $f8, $f8 \n\t" /* pack halfwords to bytes: $f8 = 0x01 in every byte */
+ /* clamp the three loaded groups against the all-ones-bytes mask, then write them back to the same offsets */
+ "pminub $f0, $f0, $f8 \n\t"
+ "pminub $f2, $f2, $f8 \n\t"
+ "pminub $f4, $f4, $f8 \n\t"
+ "gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t" /* gssdlc1/gssdrc1 pairs: unaligned 64-bit stores */
+ "gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
+ "gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
+ : /* no output operands: the result is written through the pointer, covered by the "memory" clobber */
+ : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+}
--- a/codec/common/src/cpu.cpp
+++ b/codec/common/src/cpu.cpp
@@ -307,7 +307,17 @@
WELS_CPU_NEON;
}
-#else /* Neither X86_ASM, HAVE_NEON nor HAVE_NEON_AARCH64 */
+#elif defined(__mips__) || defined(mips) /* plain `mips` is not predefined in strict ISO modes, so test __mips__ as well */
+/* for loongson: the feature set is fixed at compile time (HAVE_MMI); nothing is probed at run time and pNumberOfLogicProcessors is not written */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+#if defined(HAVE_MMI)
+  return WELS_CPU_MMI; /* build enabled MMI: report it */
+#else
+  return 0; /* MIPS build without MMI: no optional features */
+#endif
+}
+
+#else /* Neither X86_ASM, HAVE_NEON, HAVE_NEON_AARCH64 nor mips */
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
return 0;
--- a/codec/common/src/deblocking_common.cpp
+++ b/codec/common/src/deblocking_common.cpp
@@ -274,3 +274,22 @@
#endif
+#ifdef HAVE_MMI
+extern "C" {
+ void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
+   // Transpose from pPixY - 4 into a stride-16 scratch block, run the Lt4 V kernel at its row 4, transpose back.
+   ENFORCE_STACK_ALIGN_1D (uint8_t, pScratch, 8 * 16, 16);
+   DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, pScratch);
+   DeblockLumaLt4V_mmi (pScratch + 4 * 16, 16, iAlpha, iBeta, pTc);
+   DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, pScratch);
+ }
+
+ void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+   // Transpose from pPixY - 4 into a stride-16 scratch block, run the Eq4 V kernel at its row 4, transpose back.
+   ENFORCE_STACK_ALIGN_1D (uint8_t, pScratch, 8 * 16, 16);
+   DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, pScratch);
+   DeblockLumaEq4V_mmi (pScratch + 4 * 16, 16, iAlpha, iBeta);
+   DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, pScratch);
+ }
+}
+#endif//HAVE_MMI
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -63,6 +63,15 @@
endif
OBJS += $(COMMON_OBJSARM64)
+COMMON_ASM_MIPS64_SRCS=\
+ $(COMMON_SRCDIR)/mips64/deblock_mmi.c\
+
+COMMON_OBJSMIPS64 += $(COMMON_ASM_MIPS64_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips64) # only mips64 asm builds add the MMI objects to COMMON_OBJS (the prerequisites of the common library)
+COMMON_OBJS += $(COMMON_OBJSMIPS64)
+endif # ASM_ARCH == mips64; the objects are appended to OBJS below regardless of architecture
+OBJS += $(COMMON_OBJSMIPS64)
+
OBJS += $(COMMON_OBJS)
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.cpp
@@ -73,6 +82,9 @@
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.S
$(QUIET_CCAS)$(CCAS) $(CCASFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
+
+$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c # plain C sources under COMMON_SRCDIR are compiled with CC, with ASMFLAGS passed next to CFLAGS
+ $(QUIET_CC)$(CC) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
$(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
$(QUIET)rm -f $@
--- a/codec/decoder/core/src/deblocking.cpp
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -1378,6 +1378,19 @@
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_AArch64_neon;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (iCpu & WELS_CPU_MMI) { // MMI flag set in iCpu: point all eight luma/chroma deblocking hooks at the *_mmi routines
+ pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_mmi;
+ pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_mmi;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_mmi;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_mmi;
+ pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_mmi;
+ pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_mmi;
+ pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_mmi;
+ pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_mmi;
+ }
+#endif//HAVE_MMI
}
} // namespace WelsDec
--- a/test/decoder/DecUT_Deblock.cpp
+++ b/test/decoder/DecUT_Deblock.cpp
@@ -146,3 +146,20 @@
GENERATE_CHROMA_UT (ChromaEq4H_AArch64_neon, DeblockChromaEq4H_AArch64_neon_wrap, DeblockChromaEq4H_c_wrap,
WELS_CPU_NEON, 1)
#endif
+
+#if defined(HAVE_MMI)
+WRAP_LUMA_FUNC (DeblockLumaEq4V_mmi) // the *_mmi_wrap names used below presumably come from these WRAP_* macros (defined outside this hunk)
+WRAP_LUMA_FUNC (DeblockLumaEq4H_mmi)
+WRAP_CHROMA_FUNC (DeblockChromaEq4V_mmi)
+WRAP_CHROMA_FUNC (DeblockChromaEq4H_mmi)
+// Luma: each MMI routine is paired with its C counterpart under WELS_CPU_MMI; the last argument is 0 for V variants and 1 for H variants.
+GENERATE_LUMA_UT (LumaLt4V_mmi, DeblockLumaLt4V_mmi, DeblockLumaLt4V_c, WELS_CPU_MMI, 0)
+GENERATE_LUMA_UT (LumaLt4H_mmi, DeblockLumaLt4H_mmi, DeblockLumaLt4H_c, WELS_CPU_MMI, 1)
+GENERATE_LUMA_UT (LumaEq4V_mmi, DeblockLumaEq4V_mmi_wrap, DeblockLumaEq4V_c_wrap, WELS_CPU_MMI, 0)
+GENERATE_LUMA_UT (LumaEq4H_mmi, DeblockLumaEq4H_mmi_wrap, DeblockLumaEq4H_c_wrap, WELS_CPU_MMI, 1)
+// Chroma: same pairing; the Lt4 routines are passed directly, the Eq4 routines through their _wrap names.
+GENERATE_CHROMA_UT (ChromaLt4V_mmi, DeblockChromaLt4V_mmi, DeblockChromaLt4V_c, WELS_CPU_MMI, 0)
+GENERATE_CHROMA_UT (ChromaLt4H_mmi, DeblockChromaLt4H_mmi, DeblockChromaLt4H_c, WELS_CPU_MMI, 1)
+GENERATE_CHROMA_UT (ChromaEq4V_mmi, DeblockChromaEq4V_mmi_wrap, DeblockChromaEq4V_c_wrap, WELS_CPU_MMI, 0)
+GENERATE_CHROMA_UT (ChromaEq4H_mmi, DeblockChromaEq4H_mmi_wrap, DeblockChromaEq4H_c_wrap, WELS_CPU_MMI, 1)
+#endif//HAVE_MMI
--- a/test/decoder/DecUT_DeblockCommon.cpp
+++ b/test/decoder/DecUT_DeblockCommon.cpp
@@ -540,6 +540,17 @@
DeblockingInit (&sDBFunc, 0x000004);
DB_FUNC_CPUFLAG (AArch64_neon)
#endif
+
+#ifdef HAVE_MMI
+ // pure C: initialise with no CPU flags; DB_FUNC_CPUFLAG (c) presumably checks that the *_c routines were selected
+ DeblockingInit (&sDBFunc, 0x00000000);
+ DB_FUNC_CPUFLAG (c)
+
+ // mmi: 0x00000001 is assumed to be the WELS_CPU_MMI bit (TODO confirm against its definition)
+ DeblockingInit (&sDBFunc, 0x00000001);
+ DB_FUNC_CPUFLAG (mmi)
+#endif
+
}
TEST (DecoderDeblocking, WelsDeblockingFilterSlice) {