ref: 14d7bf0744bfb023544be31f853df71a2b7eebb4
parent: 4390f83cecc820beea75016b9aa74544d9d5d3e4
author: gxw <[email protected]>
date: Tue Aug 7 07:57:29 EDT 2018
Modify the targets.mk generation method: edit build/mktargets.py instead of editing targets.mk directly. Rename codec/common/mips64 to codec/common/mips.
--- a/build/arch.mk
+++ b/build/arch.mk
@@ -31,10 +31,10 @@
endif
#for loongson
-ifneq ($(filter mips64, $(ARCH)),)
+ifneq ($(filter mips mips64, $(ARCH)),)
ifeq ($(USE_ASM), Yes)
-ASM_ARCH = mips64
-ASMFLAGS += -I$(SRC_PATH)codec/common/mips64/
+ASM_ARCH = mips
+ASMFLAGS += -I$(SRC_PATH)codec/common/mips/
LOONGSON3A = $(shell g++ -dM -E - < /dev/null | grep '_MIPS_TUNE ' | cut -f 3 -d " ")
ifeq ($(LOONGSON3A), "loongson3a")
CFLAGS += -DHAVE_MMI
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -117,9 +117,16 @@
arm64files.append(file)
elif 'arm' in c:
armfiles.append(file)
+mipsfiles = []
+for file in cfiles[:]:
+    c = file.split('/')
+    if 'mips' in c:
+        mipsfiles.append(file)
+        cfiles.remove(file)
+
f = open(OUTFILE, "w")
f.write("%s_SRCDIR=%s\n"%(PREFIX, args.directory))
@@ -169,10 +176,21 @@
f.write("endif\n")
f.write("OBJS += $(%s_OBJSARM64)\n\n"%(PREFIX))
+if len(mipsfiles) > 0:
+ f.write("%s_ASM_MIPS_SRCS=\\\n"%(PREFIX))
+ for c in mipsfiles:
+ f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
+ f.write("\n")
+ f.write("%s_OBJSMIPS += $(%s_ASM_MIPS_SRCS:.c=.$(OBJ))\n"%(PREFIX, PREFIX))
+ f.write("ifeq ($(ASM_ARCH), mips)\n")
+ f.write("%s_OBJS += $(%s_OBJSMIPS)\n"%(PREFIX,PREFIX))
+ f.write("endif\n")
+ f.write("OBJS += $(%s_OBJSMIPS)\n\n"%(PREFIX))
+
f.write("OBJS += $(%s_OBJS)\n\n"%(PREFIX))
write_cpp_rule_pattern(f)
-if len(cfiles) > 0:
+if len(cfiles) > 0 or len(mipsfiles) > 0:
write_c_rule_pattern(f)
if len(asm) > 0:
--- /dev/null
+++ b/codec/common/mips/deblock_mmi.c
@@ -1,0 +1,2826 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file deblock_mmi.c
+ *
+ * \brief Loongson optimize
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+ int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[512] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dsll $8, %[iStride], 0x1 \n\t"
+ "daddu $8, $8, %[iStride] \n\t"
+ "dsubu $14, %[pPix], $8 \n\t"
+
+ "dsll $8, %[iStride], 0x1 \n\t"
+ "dsubu $9, %[pPix], $8 \n\t"
+
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "dsubu $13, %[pPix], %[iStride] \n\t"
+ "daddu %[iStride], %[iStride], %[pPix] \n\t"
+ "daddu $12, $8, %[pPix] \n\t"
+
+ "punpcklhw $f0, $f0, $f0 \n\t"
+ "lb $8, 0x0(%[pTC]) \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t"
+ "dmtc1 %[iBeta], $f0 \n\t"
+ "lb %[iAlpha], 0x1(%[pTC]) \n\t"
+ "dli %[iBeta], 0xFFFF \n\t"
+ "punpcklhw $f0, $f0, $f0 \n\t"
+ "and $10, %[iAlpha], %[iBeta] \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
+ "dmtc1 $10, $f4 \n\t"
+ "mov.d $f8, $f4 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "and %[iAlpha], $8, %[iBeta] \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "mov.d $f28, $f20 \n\t"
+ "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+
+ "lb %[iAlpha], 0x3(%[pTC]) \n\t"
+ "lb %[pTC], 0x2(%[pTC]) \n\t"
+ "dmtc1 $10, $f12 \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "and $8, %[iAlpha], %[iBeta] \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
+ "dmtc1 $8, $f0 \n\t"
+ "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
+ "mov.d $f8, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "and %[iAlpha], %[pTC], %[iBeta] \n\t"
+ "dmtc1 $8, $f12 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "punpcklhw $f20, $f20, $f0 \n\t"
+
+ "xor $f0, $f0, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f24 \n\t"
+ "and %[pTC], %[pTC], %[iBeta] \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 %[pTC], $f4 \n\t"
+
+ "gslqc1 $f10, $f8, 0x0($9) \n\t"
+ "punpckhbh $f10, $f8, $f0 \n\t"
+ "punpcklbh $f8, $f8, $f0 \n\t"
+
+ "dli %[iAlpha], 0x4 \n\t"
+ "seh %[pTC], %[iAlpha] \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
+ "gslqc1 $f14, $f12, 0x0($13) \n\t"
+ "gsldxc1 $f2, 0x0($12, $0) \n\t"
+ "punpckhbh $f22, $f20, $f0 \n\t"
+ "punpcklbh $f20, $f20, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
+ "punpckhbh $f22, $f2, $f0 \n\t"
+ "punpcklbh $f20, $f2, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
+ "punpcklhw $f4, $f4, $f16 \n\t"
+ "gslqc1 $f18, $f16, 0x0($14) \n\t"
+ "punpcklhw $f4, $f4, $f24 \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
+ "punpckhhw $f6, $f4, $f28 \n\t"
+ "punpcklhw $f4, $f4, $f28 \n\t"
+ "punpckhbh $f26, $f24, $f0 \n\t"
+ "punpcklbh $f24, $f24, $f0 \n\t"
+ "punpckhbh $f14, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f12, $f0 \n\t"
+ "punpckhbh $f18, $f16, $f0 \n\t"
+ "punpcklbh $f16, $f16, $f0 \n\t"
+ "psubh $f28, $f12, $f16 \n\t"
+ "psubh $f30, $f14, $f18 \n\t"
+ "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
+ "gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+ "psubh $f28, $f24, $f0 \n\t"
+ "psubh $f30, $f26, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
+ "pavgh $f20, $f12, $f24 \n\t"
+ "pavgh $f22, $f14, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
+ "psubh $f20, $f24, $f12 \n\t"
+ "psubh $f22, $f26, $f14 \n\t"
+ "gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f0 \n\t"
+ "psubh $f26, $f26, $f2 \n\t"
+ "gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
+ "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
+ "pcmpgth $f20, $f20, $f28 \n\t"
+ "pcmpgth $f22, $f22, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+
+ "xor $f0, $f0, $f0 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "pcmpgth $f28, $f24, $f0 \n\t"
+ "pcmpgth $f30, $f26, $f0 \n\t"
+ "pcmpeqh $f24, $f24, $f0 \n\t"
+ "pcmpeqh $f26, $f26, $f0 \n\t"
+ "or $f28, $f28, $f24 \n\t"
+ "or $f30, $f30, $f26 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
+ "dmtc1 %[pTC], $f20 \n\t"
+ "punpckhhw $f26, $f20, $f20 \n\t"
+ "punpcklhw $f24, $f20, $f20 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "mov.d $f22, $f20 \n\t"
+ "gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "psubh $f24, $f0, $f20 \n\t"
+ "dli $11, 0x2 \n\t"
+ "psubh $f26, $f0, $f22 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "psubh $f28, $f8, $f0 \n\t"
+ "psubh $f30, $f10, $f2 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "dli $11, 0x3 \n\t"
+ "dmtc1 $11, $f20 \n\t"
+ "psrah $f28, $f28, $f20 \n\t"
+ "psrah $f30, $f30, $f20 \n\t"
+ "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "pmaxsh $f24, $f24, $f28 \n\t"
+ "pmaxsh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+
+ "and $f20, $f20, $f0 \n\t"
+ "and $f22, $f22, $f2 \n\t"
+ "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t"
+ "psubh $f20, $f0, $f24 \n\t"
+ "psubh $f22, $f0, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "mov.d $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f28, $f8, $f8 \n\t"
+ "paddh $f30, $f10, $f10 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "dli $11, 0x1 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "pmaxsh $f24, $f24, $f20 \n\t"
+ "pmaxsh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f28, $f24, $f24 \n\t"
+ "paddh $f30, $f26, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x1 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t"
+ "pmaxsh $f24, $f24, $f20 \n\t"
+ "pmaxsh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "pminsh $f20, $f20, $f24 \n\t"
+ "pminsh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x0($9) \n\t"
+ "punpcklbh $f28, $f30, $f0 \n\t"
+ "punpckhbh $f30, $f30, $f0 \n\t"
+ "gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
+
+ "gslqc1 $f30, $f28, 0x0($12) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x0($14) \n\t"
+ "gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0($13) \n\t"
+ "punpcklbh $f28, $f30, $f0 \n\t"
+ "punpckhbh $f30, $f30, $f0 \n\t"
+ "punpcklbh $f20, $f22, $f0 \n\t"
+ "punpckhbh $f22, $f22, $f0 \n\t"
+ "gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
+
+ "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+
+ "psubh $f28, $f24, $f28 \n\t"
+ "psubh $f30, $f26, $f30 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+ "pcmpgth $f20, $f16, $f28 \n\t"
+ "pcmpgth $f22, $f18, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t"
+ "pavgh $f20, $f20, $f24 \n\t"
+ "pavgh $f22, $f22, $f26 \n\t"
+ "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t"
+ "psubh $f20, $f4, $f20 \n\t"
+ "psubh $f22, $f6, $f22 \n\t"
+ "psubh $f20, $f20, $f28 \n\t"
+ "psubh $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
+ "psubh $f20, $f24, $f20 \n\t"
+ "psubh $f22, $f26, $f22 \n\t"
+ "psubh $f24, $f24, $f28 \n\t"
+ "psubh $f26, $f26, $f30 \n\t"
+ "gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "mov.d $f28, $f20 \n\t"
+ "mov.d $f30, $f22 \n\t"
+ WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
+ "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
+ "pcmpgth $f20, $f20, $f28 \n\t"
+ "pcmpgth $f22, $f22, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f16, $f24 \n\t"
+ "pcmpgth $f30, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
+
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f24 \n\t"
+ "psubh $f30, $f30, $f26 \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f0 \n\t"
+ "psubh $f26, $f26, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "pcmpgth $f16, $f4, $f0 \n\t"
+ "pcmpgth $f18, $f6, $f0 \n\t"
+ "pcmpeqh $f28, $f4, $f0 \n\t"
+ "pcmpeqh $f30, $f6, $f0 \n\t"
+ "or $f16, $f16, $f28 \n\t"
+ "or $f18, $f18, $f30 \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "dli $11, 0x2 \n\t"
+ "psubh $f28, $f0, $f16 \n\t"
+ "psubh $f30, $f0, $f18 \n\t"
+ "psubh $f2, $f0, $f6 \n\t"
+ "psubh $f0, $f0, $f4 \n\t"
+ "dmfc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 $11, $f28 \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x3 \n\t"
+ "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmtc1 $11, $f0 \n\t"
+ "psrah $f24, $f24, $f0 \n\t"
+ "psrah $f26, $f26, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "pmaxsh $f28, $f28, $f24 \n\t"
+ "pmaxsh $f30, $f30, $f26 \n\t"
+ "pminsh $f16, $f16, $f28 \n\t"
+ "pminsh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "mov.d $f24, $f0 \n\t"
+ "mov.d $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t"
+ "dli $11, 0x1 \n\t"
+ "paddh $f16, $f16, $f16 \n\t"
+ "paddh $f18, $f18, $f18 \n\t"
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+
+ "dmtc1 $11, $f28 \n\t"
+ "gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t"
+ "psrah $f0, $f0, $f28 \n\t"
+ "psrah $f2, $f2, $f28 \n\t"
+ "pmaxsh $f24, $f24, $f0 \n\t"
+ "pmaxsh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
+ "pminsh $f28, $f4, $f24 \n\t"
+ "pminsh $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+ "dmfc1 %[iAlpha], $f24 \n\t"
+ "dmfc1 %[iBeta], $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f20, $f22 \n\t"
+ "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f0, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t"
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t"
+ "psubh $f16, $f16, $f20 \n\t"
+ "gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t"
+ "psubh $f18, $f18, $f22 \n\t"
+
+ "gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f24 \n\t"
+ "paddh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t"
+ "packushb $f0, $f0, $f2 \n\t"
+ "packushb $f2, $f16, $f18 \n\t"
+ "gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
+ "mov.d $f28, $f0 \n\t"
+ "mov.d $f30, $f2 \n\t"
+ "paddh $f0, $f0, $f0 \n\t"
+ "paddh $f2, $f2, $f2 \n\t"
+
+ "dmtc1 %[iAlpha], $f24 \n\t"
+ "dmtc1 %[iBeta], $f26 \n\t"
+
+ "psubh $f16, $f16, $f0 \n\t"
+ "psubh $f18, $f18, $f2 \n\t"
+ "dli $11, 0x1 \n\t"
+ "gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
+ "gssqc1 $f10, $f8, 0x0($9) \n\t"
+ "dmtc1 $11, $f8 \n\t"
+ "psrah $f16, $f16, $f8 \n\t"
+ "psrah $f18, $f18, $f8 \n\t"
+ "pmaxsh $f0, $f0, $f16 \n\t"
+ "pmaxsh $f2, $f2, $f18 \n\t"
+ "pminsh $f4, $f4, $f0 \n\t"
+ "pminsh $f6, $f6, $f2 \n\t"
+ "gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
+
+ "gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "and $f4, $f4, $f8 \n\t"
+ "and $f6, $f6, $f10 \n\t"
+ "gssqc1 $f14, $f12, 0x0($13) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "packushb $f20, $f20, $f22 \n\t"
+ "packushb $f22, $f28, $f30 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
+ "gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
+ : [pPix]"+&r"((unsigned char *)pPix)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
+ [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
+ uint8_t *pDst) {
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "dsll $8, %[iStride], 0x3 \n\t"
+ "daddu $8, $8, %[pPixY] \n\t"
+
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldlc1 $f4, 0x7($9) \n\t"
+ "gsldlc1 $f6, 0x7($10) \n\t"
+ "gsldrc1 $f0, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "gsldrc1 $f4, 0x0($9) \n\t"
+ "gsldrc1 $f6, 0x0($10) \n\t"
+ "daddu %[pPixY], $9, %[iStride] \n\t"
+ "daddu $8, $10, %[iStride] \n\t"
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0x7($10) \n\t"
+ "gsldrc1 $f8, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x0($10) \n\t"
+
+ "daddu %[pPixY], $9, %[iStride] \n\t"
+ "daddu $8, $10, %[iStride] \n\t"
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f18, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($9) \n\t"
+ "gsldlc1 $f22, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f18, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($9) \n\t"
+ "gsldrc1 $f22, 0x0($10) \n\t"
+ "daddu %[pPixY], $9, %[iStride] \n\t"
+ "daddu $8, $10, %[iStride] \n\t"
+ "daddu $9, %[pPixY], %[iStride] \n\t"
+ "daddu $10, $8, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7(%[pPixY]) \n\t"
+ "gsldlc1 $f26, 0x7($8) \n\t"
+
+ "gsldlc1 $f28, 0x7($9) \n\t"
+ "gsldlc1 $f30, 0x7($10) \n\t"
+ "gsldrc1 $f24, 0x0(%[pPixY]) \n\t"
+ "gsldrc1 $f26, 0x0($8) \n\t"
+ "gsldrc1 $f28, 0x0($9) \n\t"
+ "gsldrc1 $f30, 0x0($10) \n\t"
+
+ MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $9, $10)
+
+ "gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t"
+ "gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t"
+ "gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t"
+ "gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t"
+ "gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t"
+ : [pPixY] "+&r"((unsigned char *)pPixY)
+ : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+ "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
+ uint8_t *pSrc) {
+ BACKUP_REG;
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
+ "gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
+ "gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
+ "gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
+ "gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
+ "gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
+
+ MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+ $f14, $f16, $f18, $f20, $f22, $f24,
+ $f26, $f28, $f30, $9, $10)
+
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f16, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f8, 0x7($8) \n\t"
+ "gssdrc1 $f16, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f8, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f28, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f28, 0x0($8) \n\t"
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f4, 0x7($8) \n\t"
+ "gssdrc1 $f20, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f4, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f0, 0x7($8) \n\t"
+ "gssdrc1 $f24, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f0, 0x0($8) \n\t"
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f18, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f30, 0x7($8) \n\t"
+ "gssdrc1 $f14, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f30, 0x0($8) \n\t"
+
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f22, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ "daddu %[pPixY], $8, %[iStride] \n\t"
+ "daddu $8, %[pPixY], %[iStride] \n\t"
+ "gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f26, 0x0(%[pPixY]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ : [pPixY] "+&r"((unsigned char *)pPixY)
+ : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+ "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+ int32_t iBeta) {
+ unsigned char tmp[720] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "dsll $11, %[iStride], 0x2 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "daddu $14, %[iStride], %[pPix] \n\t"
+ "dsubu $8, %[pPix], $11 \n\t"
+ "gslqc1 $f14, $f12, 0x0($8) \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
+ "daddu $9, %[iStride], %[iStride] \n\t"
+ "daddu $10, $9, %[iStride] \n\t"
+ "move $12, $9 \n\t"
+ "dsubu $8, %[pPix], $9 \n\t"
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "dsubu $9, %[pPix], %[iStride] \n\t"
+ "gslqc1 $f18, $f16, 0x0($9) \n\t"
+ "daddu $13, %[iStride], %[pPix] \n\t"
+
+ "move %[iStride], $12 \n\t"
+ "daddu $15, $12, %[pPix] \n\t"
+
+ "daddu $12, %[pPix], $10 \n\t"
+ "dsubu $11, %[pPix], $10 \n\t"
+
+ "gslqc1 $f26, $f24, 0x0($11) \n\t"
+ "daddu %[iStride], %[iStride], %[pPix] \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+
+ "punpcklhw $f28, $f0, $f0 \n\t"
+ "punpcklwd $f0, $f28, $f28 \n\t"
+ "mov.d $f2, $f0 \n\t"
+ "gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
+ "dmtc1 %[iBeta], $f0 \n\t"
+ "gsldxc1 $f10, 0x0($15, $0) \n\t"
+ "punpcklhw $f28, $f0, $f0 \n\t"
+ "punpcklwd $f0, $f28, $f28 \n\t"
+ "punpckhbh $f30, $f10, $f8 \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "punpcklbh $f28, $f10, $f8 \n\t"
+ "gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "mov.d $f0, $f4 \n\t"
+ "gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
+ "mov.d $f4, $f16 \n\t"
+ "punpckhbh $f22, $f20, $f8 \n\t"
+ "punpcklbh $f20, $f20, $f8 \n\t"
+ "punpckhbh $f6, $f4, $f8 \n\t"
+ "punpcklbh $f4, $f4, $f8 \n\t"
+
+ "psubh $f28, $f20, $f4 \n\t"
+ "psubh $f30, $f22, $f6 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
+ "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "punpckhbh $f2, $f0, $f8 \n\t"
+ "punpcklbh $f0, $f0, $f8 \n\t"
+ "gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x0($14) \n\t"
+ "gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
+
+ "psubh $f28, $f4, $f0 \n\t"
+ "psubh $f30, $f6, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
+ "punpckhbh $f18, $f16, $f8 \n\t"
+ "punpcklbh $f16, $f16, $f8 \n\t"
+ "pcmpgth $f0, $f0, $f28 \n\t"
+ "pcmpgth $f2, $f2, $f30 \n\t"
+ "gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t"
+ "psubh $f28, $f20, $f16 \n\t"
+ "psubh $f30, $f22, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
+ "punpckhbh $f26, $f24, $f8 \n\t"
+ "punpcklbh $f24, $f24, $f8 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t"
+ "gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "dli %[iBeta], 0x2 \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "dmtc1 %[iBeta], $f10 \n\t"
+ "gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
+
+ "punpcklhw $f28, $f16, $f16 \n\t"
+ "psrah $f16, $f0, $f10 \n\t"
+ "psrah $f18, $f2, $f10 \n\t"
+ "punpcklwd $f28, $f28, $f28 \n\t"
+ "mov.d $f30, $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f8 \n\t"
+ "pcmpgth $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+
+ "gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f4, $f24 \n\t"
+ "psubh $f30, $f6, $f26 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t"
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f20, $f0 \n\t"
+ "psubh $f30, $f22, $f2 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f16, $f16, $f28 \n\t"
+ "pcmpgth $f18, $f18, $f30 \n\t"
+
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+
+ "gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "pandn $f16, $f16, $f24 \n\t"
+ "dli %[iAlpha], 0x4 \n\t"
+ "pandn $f18, $f18, $f26 \n\t"
+ "gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f16 \n\t"
+ "punpcklhw $f28, $f16, $f16 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "punpckhbh $f18, $f12, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f30 \n\t"
+ "punpcklbh $f16, $f12, $f8 \n\t"
+ "psllh $f16, $f16, $f30 \n\t"
+ "psllh $f18, $f18, $f30 \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "paddh $f16, $f16, $f24 \n\t"
+ "paddh $f18, $f18, $f26 \n\t"
+ "paddh $f16, $f16, $f0 \n\t"
+ "paddh $f18, $f18, $f2 \n\t"
+
+ "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "punpcklwd $f28, $f28, $f28 \n\t"
+ "mov.d $f30, $f28 \n\t"
+ "paddh $f16, $f16, $f4 \n\t"
+ "paddh $f18, $f18, $f6 \n\t"
+ "gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f20 \n\t"
+ "paddh $f18, $f18, $f22 \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t"
+ "pandn $f24, $f24, $f28 \n\t"
+ "pandn $f26, $f26, $f30 \n\t"
+ "gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x0($12) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "punpckhbh $f26, $f24, $f8 \n\t"
+ "punpcklbh $f24, $f24, $f8 \n\t"
+ "psllh $f24, $f24, $f10 \n\t"
+ "psllh $f26, $f26, $f10 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+ "gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "pandn $f24, $f24, $f28 \n\t"
+ "pandn $f26, $f26, $f30 \n\t"
+ "gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "psrah $f16, $f16, $f10 \n\t"
+ "psrah $f18, $f18, $f10 \n\t"
+ "and $f16, $f16, $f0 \n\t"
+ "and $f18, $f18, $f2 \n\t"
+ "gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f4, $f20 \n\t"
+ "paddh $f30, $f6, $f22 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f20, $f20, $f4 \n\t"
+ "paddh $f22, $f22, $f6 \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f28, $f28, $f24 \n\t"
+ "and $f30, $f30, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f24 \n\t"
+ "pandn $f30, $f30, $f26 \n\t"
+ "gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f24 \n\t"
+ "paddh $f30, $f30, $f26 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t"
+ "psrah $f28, $f28, $f10 \n\t"
+ "psrah $f30, $f30, $f10 \n\t"
+ "and $f20, $f20, $f28 \n\t"
+ "and $f22, $f22, $f30 \n\t"
+ "gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t"
+
+ "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f28, $f20, $f20 \n\t"
+ "paddh $f30, $f22, $f22 \n\t"
+ "paddh $f20, $f4, $f24 \n\t"
+ "paddh $f22, $f6, $f26 \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t"
+ "psrah $f28, $f28, $f10 \n\t"
+ "psrah $f30, $f30, $f10 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "pandn $f20, $f20, $f28 \n\t"
+ "pandn $f22, $f22, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f4 \n\t"
+ "paddh $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t"
+ "psllh $f28, $f28, $f10 \n\t"
+ "psllh $f30, $f30, $f10 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "paddh $f28, $f28, $f24 \n\t"
+ "paddh $f30, $f30, $f26 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+
+ "dli %[iAlpha], 0x2 \n\t"
+ "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f10 \n\t"
+ "psrah $f22, $f22, $f10 \n\t"
+ "and $f4, $f4, $f20 \n\t"
+ "and $f6, $f6, $f22 \n\t"
+ "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t"
+ "paddh $f24, $f4, $f4 \n\t"
+ "paddh $f26, $f6, $f6 \n\t"
+ "paddh $f4, $f4, $f8 \n\t"
+ "paddh $f6, $f6, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "paddh $f4, $f4, $f8 \n\t"
+ "paddh $f6, $f6, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "psrah $f24, $f24, $f8 \n\t"
+ "psrah $f26, $f26, $f8 \n\t"
+ "psllh $f4, $f4, $f10 \n\t"
+ "psllh $f6, $f6, $f10 \n\t"
+ "paddh $f4, $f4, $f20 \n\t"
+ "paddh $f6, $f6, $f22 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+
+ "gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f24 \n\t"
+ "pandn $f30, $f30, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+ "paddh $f24, $f24, $f4 \n\t"
+ "paddh $f26, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f10 \n\t"
+ "psrah $f26, $f26, $f10 \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+
+ "xor $f8, $f8, $f8 \n\t"
+ "gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
+ "punpcklbh $f4, $f6, $f8 \n\t"
+ "punpckhbh $f6, $f6, $f8 \n\t"
+ "gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t"
+ "punpcklbh $f4, $f6, $f8 \n\t"
+ "punpckhbh $f6, $f6, $f8 \n\t"
+ "punpcklbh $f24, $f26, $f8 \n\t"
+ "punpckhbh $f26, $f26, $f8 \n\t"
+ "gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
+ "punpcklbh $f20, $f22, $f8 \n\t"
+ "punpckhbh $f22, $f22, $f8 \n\t"
+ "gslqc1 $f30, $f28, 0x0($14) \n\t"
+ "gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 640-432(%[tmp]) \n\t"
+
+ "gsldxc1 $f0, 0x8($15, $0) \n\t"
+ "punpcklbh $f28, $f30, $f8 \n\t"
+ "punpckhbh $f30, $f30, $f8 \n\t"
+ "gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
+
+ "punpcklbh $f28, $f0, $f8 \n\t"
+ "punpckhbh $f30, $f0, $f8 \n\t"
+ "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
+
+ "psubh $f28, $f24, $f4 \n\t"
+ "psubh $f30, $f26, $f6 \n\t"
+ "psubh $f24, $f24, $f8 \n\t"
+ "psubh $f26, $f26, $f10 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
+ "psubh $f28, $f4, $f28 \n\t"
+ "psubh $f30, $f6, $f30 \n\t"
+
+ "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f4, $f0, $f28 \n\t"
+ "pcmpgth $f6, $f2, $f30 \n\t"
+ "pcmpgth $f28, $f0, $f24 \n\t"
+ "pcmpgth $f30, $f2, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t"
+ "and $f4, $f4, $f28 \n\t"
+ "and $f6, $f6, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t"
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "punpcklbh $f12, $f14, $f8 \n\t"
+ "punpckhbh $f14, $f14, $f8 \n\t"
+ "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f8 \n\t"
+ "psubh $f30, $f30, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f10 \n\t"
+
+ "psllh $f12, $f12, $f10 \n\t"
+ "psllh $f14, $f14, $f10 \n\t"
+ "gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
+
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f20 \n\t"
+ "paddh $f14, $f14, $f22 \n\t"
+ "paddh $f12, $f12, $f8 \n\t"
+ "paddh $f14, $f14, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f8 \n\t"
+ "paddh $f14, $f14, $f10 \n\t"
+ WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+ "pcmpgth $f24, $f24, $f28 \n\t"
+ "pcmpgth $f26, $f26, $f30 \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+ "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
+
+ "gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t"
+ "and $f24, $f0, $f16 \n\t"
+ "and $f26, $f2, $f18 \n\t"
+ "pandn $f16, $f0, $f28 \n\t"
+ "pandn $f18, $f2, $f30 \n\t"
+ "or $f24, $f24, $f16 \n\t"
+ "or $f26, $f26, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f12, $f12, $f28 \n\t"
+ "psrah $f14, $f14, $f28 \n\t"
+ "and $f12, $f12, $f8 \n\t"
+ "and $f14, $f14, $f10 \n\t"
+ "pandn $f8, $f8, $f20 \n\t"
+ "pandn $f10, $f10, $f22 \n\t"
+ "or $f12, $f12, $f8 \n\t"
+ "or $f14, $f14, $f10 \n\t"
+ "and $f28, $f4, $f12 \n\t"
+ "and $f30, $f6, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
+ "or $f12, $f12, $f8 \n\t"
+ "or $f14, $f14, $f10 \n\t"
+ "pandn $f8, $f4, $f20 \n\t"
+ "pandn $f10, $f6, $f22 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+
+ "dli %[iAlpha], 0x2 \n\t"
+ "and $f8, $f0, $f12 \n\t"
+ "and $f10, $f2, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t"
+ "pandn $f12, $f0, $f12 \n\t"
+ "pandn $f14, $f2, $f14 \n\t"
+ "or $f8, $f8, $f12 \n\t"
+ "or $f10, $f10, $f14 \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f28, $f30 \n\t"
+ "gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "paddh $f8, $f20, $f8 \n\t"
+ "paddh $f10, $f22, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f16 \n\t"
+ "paddh $f30, $f30, $f18 \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f8, $f8, $f28 \n\t"
+ "psrah $f10, $f10, $f28 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+ "gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
+ "pandn $f28, $f28, $f8 \n\t"
+ "pandn $f30, $f30, $f10 \n\t"
+ "or $f24, $f24, $f28 \n\t"
+ "or $f26, $f26, $f30 \n\t"
+ "and $f12, $f4, $f24 \n\t"
+ "and $f14, $f6, $f26 \n\t"
+ "pandn $f24, $f4, $f8 \n\t"
+ "pandn $f26, $f6, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "or $f12, $f12, $f24 \n\t"
+ "or $f14, $f14, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f12, $f14 \n\t"
+ "psllh $f8, $f8, $f28 \n\t"
+ "psllh $f10, $f10, $f28 \n\t"
+ "gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t"
+ "or $f24, $f24, $f28 \n\t"
+ "or $f26, $f26, $f30 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+
+ "and $f12, $f0, $f24 \n\t"
+ "and $f14, $f2, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t"
+ "pandn $f24, $f0, $f24 \n\t"
+ "pandn $f26, $f2, $f26 \n\t"
+ "or $f12, $f12, $f24 \n\t"
+ "or $f14, $f14, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
+ "gssqc1 $f14, $f12, 640-352(%[tmp]) \n\t"
+ "gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "paddh $f8, $f8, $f12 \n\t"
+ "paddh $f10, $f10, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "and $f24, $f24, $f20 \n\t"
+ "and $f26, $f26, $f22 \n\t"
+ "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "paddh $f16, $f12, $f12 \n\t"
+ "paddh $f18, $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f8 \n\t"
+ "paddh $f18, $f18, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f28 \n\t"
+ "paddh $f18, $f18, $f30 \n\t"
+ "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f16, $f16, $f28 \n\t"
+ "psrah $f18, $f18, $f28 \n\t"
+ "pandn $f8, $f8, $f16 \n\t"
+ "pandn $f10, $f10, $f18 \n\t"
+ "or $f24, $f24, $f8 \n\t"
+ "or $f26, $f26, $f10 \n\t"
+ "and $f28, $f4, $f24 \n\t"
+ "and $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t"
+ "pandn $f8, $f4, $f24 \n\t"
+ "pandn $f10, $f6, $f26 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f28, $f30 \n\t"
+ "gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
+ "or $f8, $f8, $f28 \n\t"
+ "or $f10, $f10, $f30 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+
+ "and $f16, $f0, $f8 \n\t"
+ "and $f18, $f2, $f10 \n\t"
+ "paddh $f20, $f20, $f24 \n\t"
+ "paddh $f22, $f22, $f26 \n\t"
+ "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
+ "pandn $f8, $f0, $f28 \n\t"
+ "pandn $f10, $f2, $f30 \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "psllh $f20, $f20, $f28 \n\t"
+ "psllh $f22, $f22, $f28 \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "psrah $f8, $f8, $f28 \n\t"
+ "psrah $f10, $f10, $f28 \n\t"
+ "gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
+ "and $f16, $f16, $f8 \n\t"
+ "and $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
+ "paddh $f20, $f8, $f8 \n\t"
+ "paddh $f22, $f10, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f28 \n\t"
+ "paddh $f10, $f10, $f30 \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f28 \n\t"
+ "paddh $f22, $f22, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
+ "psrah $f20, $f20, $f28 \n\t"
+ "psrah $f22, $f22, $f28 \n\t"
+ "pandn $f12, $f12, $f20 \n\t"
+ "pandn $f14, $f14, $f22 \n\t"
+ "or $f16, $f16, $f12 \n\t"
+ "or $f18, $f18, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
+ "or $f12, $f12, $f28 \n\t"
+ "or $f14, $f14, $f30 \n\t"
+ "and $f28, $f4, $f16 \n\t"
+ "and $f30, $f6, $f18 \n\t"
+ "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
+ "pandn $f8, $f4, $f16 \n\t"
+ "pandn $f10, $f6, $f18 \n\t"
+ "or $f28, $f28, $f8 \n\t"
+ "or $f30, $f30, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
+ "paddh $f16, $f16, $f8 \n\t"
+ "paddh $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f28, $f30 \n\t"
+ "dli %[iAlpha], 0x2 \n\t"
+ "gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t"
+
+ "and $f8, $f0, $f12 \n\t"
+ "and $f10, $f2, $f14 \n\t"
+ "gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t"
+ "pandn $f12, $f0, $f28 \n\t"
+ "pandn $f14, $f2, $f30 \n\t"
+ "or $f8, $f8, $f12 \n\t"
+ "or $f10, $f10, $f14 \n\t"
+ "gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
+ "paddh $f12, $f8, $f28 \n\t"
+ "paddh $f14, $f10, $f30 \n\t"
+ "paddh $f12, $f12, $f16 \n\t"
+ "paddh $f14, $f14, $f18 \n\t"
+ "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
+ "paddh $f12, $f12, $f28 \n\t"
+ "paddh $f14, $f14, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f12, $f12, $f28 \n\t"
+ "psrah $f14, $f14, $f28 \n\t"
+ "and $f24, $f24, $f12 \n\t"
+ "and $f26, $f26, $f14 \n\t"
+ "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
+ "pandn $f16, $f12, $f20 \n\t"
+ "pandn $f18, $f14, $f22 \n\t"
+ "or $f24, $f24, $f16 \n\t"
+ "or $f26, $f26, $f18 \n\t"
+ "and $f28, $f4, $f24 \n\t"
+ "and $f30, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t"
+ "pandn $f16, $f4, $f20 \n\t"
+ "pandn $f18, $f6, $f22 \n\t"
+ "or $f28, $f28, $f16 \n\t"
+ "or $f30, $f30, $f18 \n\t"
+ "dli %[iAlpha], 0x1 \n\t"
+
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f28, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t"
+ "or $f28, $f28, $f16 \n\t"
+ "or $f30, $f30, $f18 \n\t"
+ "and $f16, $f0, $f28 \n\t"
+ "and $f18, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
+ "pandn $f0, $f0, $f28 \n\t"
+ "pandn $f2, $f2, $f30 \n\t"
+ "or $f16, $f16, $f0 \n\t"
+ "or $f18, $f18, $f2 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gslqc1 $f2, $f0, 0x0($12) \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "punpcklbh $f0, $f2, $f30 \n\t"
+ "punpckhbh $f2, $f2, $f30 \n\t"
+ "psllh $f0, $f0, $f28 \n\t"
+ "psllh $f2, $f2, $f28 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f8 \n\t"
+ "paddh $f2, $f2, $f10 \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "dli %[iAlpha], 0x3 \n\t"
+ "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "paddh $f2, $f2, $f30 \n\t"
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "psrah $f0, $f0, $f28 \n\t"
+ "psrah $f2, $f2, $f28 \n\t"
+ "and $f0, $f0, $f12 \n\t"
+ "and $f2, $f2, $f14 \n\t"
+ "pandn $f12, $f12, $f8 \n\t"
+ "pandn $f14, $f14, $f10 \n\t"
+ "or $f0, $f0, $f12 \n\t"
+ "or $f2, $f2, $f14 \n\t"
+ "and $f28, $f4, $f0 \n\t"
+ "and $f30, $f6, $f2 \n\t"
+
+ "gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t"
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+
+ "gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($8) \n\t"
+ "gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t"
+ "gssqc1 $f2, $f0, 0x0($9) \n\t"
+ "gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t"
+
+ "pandn $f4, $f4, $f8 \n\t"
+ "pandn $f6, $f6, $f10 \n\t"
+ "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
+ "or $f28, $f28, $f4 \n\t"
+ "or $f30, $f30, $f6 \n\t"
+ "packushb $f16, $f16, $f18 \n\t"
+ "packushb $f18, $f28, $f30 \n\t"
+ "gssqc1 $f26, $f24, 0x0($13) \n\t"
+ "gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t"
+ : [pPix]"+&r"((unsigned char *)pPix)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+ "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*!
+ * Chroma weak deblocking filter ("Lt4", i.e. bS < 4) for the vertical
+ * filtering direction, processing the Cb and Cr planes together, implemented
+ * with Loongson-3A MMI SIMD instructions.
+ *
+ * \param pPixCb  pointer to the first q-row of the Cb plane (the edge lies
+ *                between pPixCb - iStride and pPixCb)
+ * \param pPixCr  pointer to the first q-row of the Cr plane
+ * \param iStride picture stride in bytes (shared by both planes)
+ * \param iAlpha  alpha threshold, compared against |p0 - q0|
+ * \param iBeta   beta threshold, compared against |p1 - p0| and |q1 - q0|
+ * \param pTC     four per-partition tc clipping values (one byte each); the
+ *                code builds a tc>0 mask, so a negative tc disables filtering
+ *
+ * NOTE(review): the row/plane roles above are inferred from the pointer
+ * arithmetic below (rows are loaded at p - 2*stride .. p + stride for both
+ * planes) -- confirm against the C reference implementation.
+ */
+void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+ /* 32-byte-aligned scratch area: widened (16-bit) pixel rows, splatted
+ * constants and comparison masks are spilled here at fixed offsets. */
+ unsigned char tmp[256] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Read the four tc bytes from pTC[0..3] and interleave them with
+ * punpcklhw into one halfword vector (f0/f2) so each filtered column
+ * pair gets its own tc value. %[pTC] is reused as a scratch GPR from
+ * here on. */
+ "lb $8, 0x2(%[pTC]) \n\t"
+ "lb $9, 0x3(%[pTC]) \n\t"
+ "move $11, $8 \n\t"
+ "lb $8, 0x1(%[pTC]) \n\t"
+ "lb %[pTC], 0x0(%[pTC]) \n\t"
+ "move $12, %[pTC] \n\t"
+ "and %[pTC], $9, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f4 \n\t"
+ "and %[pTC], $9, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f8 \n\t"
+ "move %[pTC], $11 \n\t"
+ "and $9, %[pTC], 0xFFFF \n\t"
+ "and %[pTC], %[pTC], 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f16 \n\t"
+ "and %[pTC], $8, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f20 \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "and %[pTC], $8, 0xFFFF \n\t"
+ "dmtc1 %[pTC], $f24 \n\t"
+ "move %[pTC], $12 \n\t"
+ "and $9, %[pTC], 0xFFFF \n\t"
+ "and %[pTC], %[pTC], 0xFFFF \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+ /* Zero vector; also parked at tmp+0x40 for later use. */
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
+ "dmtc1 $9, $f28 \n\t"
+ "dmtc1 %[pTC], $f0 \n\t"
+ /* Derive the row pointers: %[pTC] = 2*stride, $9 = p - 2*stride, and
+ * load the four rows (p1, p0, q0, q1) of each plane as 8-byte loads. */
+ "daddu %[pTC], %[iStride], %[iStride] \n\t"
+ "dsubu $9, %[pPixCb], %[pTC] \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t"
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+ "gsldxc1 $f24, 0x0($9, $0) \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "dsubu $9, %[pPixCr], %[pTC] \n\t"
+ /* tmp+0x60 = 0 - tc (negated clip bound, used by pmaxsh below). */
+ "psubh $f8, $f4, $f0 \n\t"
+ "psubh $f10, $f6, $f2 \n\t"
+ "gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f26, $f8 \n\t"
+ "dsubu %[pTC], %[pPixCb], %[iStride] \n\t"
+ "gsldxc1 $f28, 0x0(%[pTC], $0) \n\t"
+ "dsubu $9, %[pPixCr], %[iStride] \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f30, $f8 \n\t"
+ "gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t"
+ "mov.d $f14, $f8 \n\t"
+ "gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t"
+ /* Broadcast iAlpha and iBeta into halfword vectors; beta is spilled
+ * to tmp+0x50 (alpha stays live in f16/f18). */
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "punpcklhw $f16, $f8, $f8 \n\t"
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f20, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f20, $f20 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ /* Widen the loaded byte rows to unsigned halfwords (punpck{l,h}bh
+ * against zero) and spill them: 0x30/0x70/0x80/0x90/0xa0 hold the
+ * per-plane p1/p0/q0/q1 rows used by the filter below. */
+ "punpckhbh $f10, $f24, $f4 \n\t"
+ "punpcklbh $f8, $f24, $f4 \n\t"
+ "gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t"
+ "punpcklwd $f16, $f16, $f16 \n\t"
+ "mov.d $f18, $f16 \n\t"
+ "gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t"
+ "mov.d $f8, $f28 \n\t"
+ "mov.d $f10, $f30 \n\t"
+ "punpcklbh $f28, $f30, $f6 \n\t"
+ "punpckhbh $f30, $f30, $f6 \n\t"
+ "punpckhbh $f22, $f20, $f4 \n\t"
+ "punpcklbh $f20, $f20, $f4 \n\t"
+ "gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t"
+ "punpckhbh $f14, $f12, $f4 \n\t"
+ "punpcklbh $f12, $f12, $f4 \n\t"
+ /* Splat the rounding constant 4 into tmp+0x20.
+ * NOTE(review): %[iBeta] (and %[iAlpha]/%[pTC] earlier) are written
+ * here even though the constraint list declares them input-only "r"
+ * operands -- verify; they should arguably be "+r" in/outs. */
+ "dli %[iBeta], 0x4 \n\t"
+ "punpckhbh $f10, $f8, $f4 \n\t"
+ "punpcklbh $f8, $f8, $f4 \n\t"
+ "dmtc1 %[iBeta], $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ /* tc > 0 enable mask, kept at tmp+0x40 (overwrites the zero). */
+ "pcmpgth $f24, $f0, $f4 \n\t"
+ "pcmpgth $f26, $f2, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ /* Weak-filter delta for the first plane:
+ * ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, then clamped to
+ * [-tc, tc] with pmaxsh (against -tc) / pminsh (against tc);
+ * result is kept at tmp+0x10. */
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ "dmfc1 %[iAlpha], $f12 \n\t"
+ "dmfc1 %[iBeta], $f14 \n\t"
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f12 \n\t"
+ "dli $10, 0x3 \n\t"
+ "dmtc1 $10, $f14 \n\t"
+ "psllh $f24, $f24, $f12 \n\t"
+ "psllh $f26, $f26, $f12 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f14 \n\t"
+ "psrah $f26, $f26, $f14 \n\t"
+ "dmtc1 %[iAlpha], $f12 \n\t"
+ "dmtc1 %[iBeta], $f14 \n\t"
+ "pmaxsh $f4, $f4, $f24 \n\t"
+ "pmaxsh $f6, $f6, $f26 \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
+ "pminsh $f24, $f24, $f4 \n\t"
+ "pminsh $f26, $f26, $f6 \n\t"
+ "gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
+ /* Boundary-strength tests for the first plane:
+ * alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|; the three masks
+ * and the tc>0 mask are ANDed into the clipped delta. */
+ "psubh $f4, $f8, $f12 \n\t"
+ "psubh $f6, $f10, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
+ "pcmpgth $f24, $f16, $f4 \n\t"
+ "pcmpgth $f26, $f18, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ /* Park f8/f10 in the (already consumed) iAlpha/iBeta GPRs so the FP
+ * registers can be reused as WELS_AbsH scratch. */
+ "dmfc1 %[iAlpha], $f8 \n\t"
+ "dmfc1 %[iBeta], $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
+ "pcmpgth $f28, $f28, $f4 \n\t"
+ "pcmpgth $f30, $f30, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
+ "and $f24, $f24, $f28 \n\t"
+ "and $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f12 \n\t"
+ "psubh $f22, $f22, $f14 \n\t"
+ WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
+ "pcmpgth $f4, $f4, $f20 \n\t"
+ "pcmpgth $f6, $f6, $f22 \n\t"
+ "gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f8 \n\t"
+ "psubh $f22, $f22, $f10 \n\t"
+ "and $f24, $f24, $f4 \n\t"
+ "and $f26, $f26, $f6 \n\t"
+ "gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
+ "and $f24, $f24, $f8 \n\t"
+ "and $f26, $f26, $f10 \n\t"
+ "gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ /* Second plane: same delta computation
+ * ((q0-p0)<<2 + (p1-q1) + 4) >> 3, clamped to [-tc, tc]. */
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f8 \n\t"
+ "psllh $f24, $f24, $f8 \n\t"
+ "psllh $f26, $f26, $f8 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "dli $10, 0x3 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dmtc1 $10, $f8 \n\t"
+ "gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f8 \n\t"
+ "psrah $f26, $f26, $f8 \n\t"
+ "pmaxsh $f20, $f20, $f24 \n\t"
+ "pmaxsh $f22, $f22, $f26 \n\t"
+ "pminsh $f0, $f0, $f20 \n\t"
+ "pminsh $f2, $f2, $f22 \n\t"
+ /* Second plane: alpha/beta threshold masks, ANDed with the tc>0
+ * mask, gate the clipped delta in f0/f2. */
+ "gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t"
+ "psubh $f24, $f4, $f20 \n\t"
+ "psubh $f26, $f6, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "pcmpgth $f16, $f16, $f24 \n\t"
+ "pcmpgth $f18, $f18, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "dmtc1 %[iBeta], $f10 \n\t"
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "and $f16, $f16, $f24 \n\t"
+ "and $f18, $f18, $f26 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ /* Apply the deltas (p0 += d, q0 -= d), re-pack to unsigned bytes
+ * with saturation, and store 8 bytes back to each p0/q0 row
+ * (%[pTC] currently holds pPixCb - iStride, $9 pPixCr - iStride). */
+ "gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "paddh $f4, $f4, $f0 \n\t"
+ "paddh $f6, $f6, $f2 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f4, $f6 \n\t"
+ "gssdxc1 $f8, 0x0(%[pTC], $0) \n\t"
+ "psubh $f12, $f12, $f16 \n\t"
+ "psubh $f14, $f14, $f18 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f20, $f22 \n\t"
+ "gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "gssdxc1 $f10, 0x0($9, $0) \n\t"
+ "gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
+ [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta) {
+ unsigned char tmp[128] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddu $8, %[iStride], %[iStride] \n\t"
+ "dsubu $9, %[pPixCb], $8 \n\t"
+ "gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t"
+ "gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t"
+ "gsldxc1 $f4, 0x0($9, $0) \n\t"
+ "dsubu $9, %[pPixCr], $8 \n\t"
+ "gsldxc1 $f8, 0x0($9, $0) \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "dsubu $8, %[pPixCb], %[iStride] \n\t"
+ "gsldxc1 $f8, 0x0($8, $0) \n\t"
+ "dsubu $9, %[pPixCr], %[iStride] \n\t"
+ "gsldxc1 $f12, 0x0($9, $0) \n\t"
+ "mov.d $f10, $f12 \n\t"
+ "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
+ "mov.d $f14, $f16 \n\t"
+ "gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t"
+ "mov.d $f18, $f20 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "xor $f2, $f2, $f2 \n\t"
+ "punpcklhw $f24, $f20, $f20 \n\t"
+ "punpcklwd $f20, $f24, $f24 \n\t"
+ "mov.d $f22, $f20 \n\t"
+ "dmtc1 %[iBeta], $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "mov.d $f28, $f4 \n\t"
+ "punpcklbh $f4, $f6, $f2 \n\t"
+ "punpckhbh $f6, $f6, $f2 \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
+ "punpckhbh $f30, $f8, $f0 \n\t"
+ "punpcklbh $f28, $f8, $f0 \n\t"
+ "gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t"
+ "punpckhbh $f30, $f12, $f0 \n\t"
+ "punpcklbh $f28, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f14, $f2 \n\t"
+ "punpckhbh $f14, $f14, $f2 \n\t"
+ "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "mov.d $f28, $f16 \n\t"
+ "punpcklbh $f16, $f18, $f2 \n\t"
+ "punpckhbh $f18, $f18, $f2 \n\t"
+ "punpcklbh $f8, $f10, $f2 \n\t"
+ "punpckhbh $f10, $f10, $f2 \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t"
+ "psubh $f4, $f12, $f0 \n\t"
+ "psubh $f6, $f14, $f2 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
+ "pcmpgth $f0, $f20, $f4 \n\t"
+ "pcmpgth $f2, $f22, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f12 \n\t"
+ "psubh $f6, $f6, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
+ "psubh $f4, $f28, $f16 \n\t"
+ "psubh $f6, $f30, $f18 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f4 \n\t"
+ "psubh $f6, $f10, $f6 \n\t"
+ "dmfc1 %[iAlpha], $f28 \n\t"
+ "dmfc1 %[iBeta], $f30 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+ "pcmpgth $f20, $f20, $f4 \n\t"
+ "pcmpgth $f22, $f22, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+ "pcmpgth $f16, $f24, $f4 \n\t"
+ "pcmpgth $f18, $f26, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f28 \n\t"
+ "psubh $f6, $f6, $f30 \n\t"
+ "and $f20, $f20, $f16 \n\t"
+ "and $f22, $f22, $f18 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+ "dmtc1 %[iAlpha], $f28 \n\t"
+ "dmtc1 %[iBeta], $f30 \n\t"
+ "pcmpgth $f24, $f24, $f4 \n\t"
+ "pcmpgth $f26, $f26, $f6 \n\t"
+ "and $f20, $f20, $f24 \n\t"
+ "and $f22, $f22, $f26 \n\t"
+ "dli %[iBeta], 0x2 \n\t"
+ "dmtc1 %[iBeta], $f4 \n\t"
+ "punpcklhw $f16, $f4, $f4 \n\t"
+ "punpcklwd $f4, $f16, $f16 \n\t"
+ "mov.d $f6, $f4 \n\t"
+ "gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t"
+ "paddh $f24, $f16, $f16 \n\t"
+ "paddh $f26, $f18, $f18 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "dmtc1 %[iBeta], $f16 \n\t"
+ "psrah $f24, $f24, $f16 \n\t"
+ "psrah $f26, $f26, $f16 \n\t"
+ "pandn $f16, $f0, $f12 \n\t"
+ "pandn $f18, $f2, $f14 \n\t"
+ "gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t"
+ "and $f4, $f0, $f24 \n\t"
+ "and $f6, $f2, $f26 \n\t"
+ "or $f4, $f4, $f16 \n\t"
+ "or $f6, $f6, $f18 \n\t"
+ "paddh $f24, $f12, $f12 \n\t"
+ "paddh $f26, $f14, $f14 \n\t"
+ "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "dmtc1 %[iBeta], $f16 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "psrah $f24, $f24, $f16 \n\t"
+ "psrah $f26, $f26, $f16 \n\t"
+ "and $f16, $f20, $f24 \n\t"
+ "and $f18, $f22, $f26 \n\t"
+ "pandn $f24, $f20, $f8 \n\t"
+ "pandn $f26, $f22, $f10 \n\t"
+ "or $f16, $f16, $f24 \n\t"
+ "or $f18, $f18, $f26 \n\t"
+ "packushb $f4, $f4, $f6 \n\t"
+ "packushb $f6, $f16, $f18 \n\t"
+ "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
+ "paddh $f24, $f28, $f28 \n\t"
+ "paddh $f26, $f30, $f30 \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "dmtc1 %[iBeta], $f28 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "psrah $f24, $f24, $f28 \n\t"
+ "psrah $f26, $f26, $f28 \n\t"
+ "and $f8, $f0, $f24 \n\t"
+ "and $f10, $f2, $f26 \n\t"
+ "pandn $f0, $f0, $f16 \n\t"
+ "pandn $f2, $f2, $f18 \n\t"
+ "or $f8, $f8, $f0 \n\t"
+ "or $f10, $f10, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f0, $f0 \n\t"
+ "paddh $f26, $f2, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f16 \n\t"
+ "paddh $f26, $f26, $f18 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "gssdxc1 $f4, 0x0($8, $0) \n\t"
+ "psrah $f24, $f24, $f28 \n\t"
+ "psrah $f26, $f26, $f28 \n\t"
+ "and $f16, $f20, $f24 \n\t"
+ "and $f18, $f22, $f26 \n\t"
+ "pandn $f20, $f20, $f0 \n\t"
+ "pandn $f22, $f22, $f2 \n\t"
+ "or $f16, $f16, $f20 \n\t"
+ "or $f18, $f18, $f22 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f16, $f18 \n\t"
+ "gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t"
+ "gssdxc1 $f6, 0x0($9, $0) \n\t"
+ "gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta) {
+ unsigned char tmp[256] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
+ "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
+ "move $9, %[pPixCb] \n\t"
+ "move $10, %[pPixCr] \n\t"
+ "dsll $11, %[iStride], 0x2 \n\t"
+ "daddu %[pPixCb], %[pPixCb], $11 \n\t"
+ "daddu %[pPixCr], %[pPixCr], $11 \n\t"
+ "daddiu $11, %[tmp], 0x80 \n\t"
+ "gsldlc1 $f0, 0x7($9) \n\t"
+ "gsldrc1 $f0, 0x0($9) \n\t"
+ "daddu $12, $9, %[iStride] \n\t"
+ "gsldlc1 $f4, 0x7($12) \n\t"
+ "gsldrc1 $f4, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7($12) \n\t"
+ "gsldrc1 $f8, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f12, 0x7($12) \n\t"
+ "gsldlc1 $f16, 0x7($10) \n\t"
+ "gsldrc1 $f12, 0x0($12) \n\t"
+ "gsldrc1 $f16, 0x0($10) \n\t"
+ "daddu $12, $10, %[iStride] \n\t"
+ "gsldlc1 $f20, 0x7($12) \n\t"
+ "gsldrc1 $f20, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7($12) \n\t"
+ "gsldrc1 $f24, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsldlc1 $f28, 0x7($12) \n\t"
+ "gsldrc1 $f28, 0x0($12) \n\t"
+ "punpcklwd $f0, $f0, $f16 \n\t"
+ "punpcklwd $f4, $f4, $f20 \n\t"
+ "punpcklwd $f8, $f8, $f24 \n\t"
+ "punpcklwd $f12, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixCb]) \n\t"
+ "gsldlc1 $f20, 0x7(%[pPixCr]) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixCb]) \n\t"
+ "gsldrc1 $f20, 0x0(%[pPixCr]) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "daddu $12, %[pPixCb], %[iStride] \n\t"
+ "daddu $13, %[pPixCr], %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f6, $f16 \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "daddu $13, $13, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "daddu $13, $13, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($12) \n\t"
+ "gsldlc1 $f20, 0x7($13) \n\t"
+ "gsldrc1 $f16, 0x0($12) \n\t"
+ "gsldrc1 $f20, 0x0($13) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f14, $f16 \n\t"
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+ "gssqc1 $f22, $f20, 0x10($11) \n\t"
+ "gssqc1 $f6, $f4, 0x20($11) \n\t"
+ "gssqc1 $f26, $f24, 0x30($11) \n\t"
+ "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
+ "gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f4 \n\t"
+ "punpcklhw $f8, $f4, $f4 \n\t"
+ "punpcklwd $f4, $f8, $f8 \n\t"
+ "mov.d $f6, $f4 \n\t"
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f12, $f12 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "mov.d $f12, $f24 \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f0 \n\t"
+ "punpckhbh $f26, $f26, $f0 \n\t"
+ "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "punpckhbh $f30, $f28, $f0 \n\t"
+ "punpcklbh $f28, $f28, $f0 \n\t"
+ "punpckhbh $f18, $f16, $f0 \n\t"
+ "punpcklbh $f16, $f16, $f0 \n\t"
+ "punpckhbh $f22, $f20, $f0 \n\t"
+ "punpcklbh $f20, $f20, $f0 \n\t"
+ "punpckhbh $f14, $f12, $f0 \n\t"
+ "punpcklbh $f12, $f12, $f0 \n\t"
+ "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f16, $f20 \n\t"
+ "psubh $f26, $f18, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f0, $f4, $f24 \n\t"
+ "pcmpgth $f2, $f6, $f26 \n\t"
+ "psubh $f24, $f12, $f16 \n\t"
+ "psubh $f26, $f14, $f18 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ "and $f0, $f0, $f28 \n\t"
+ "and $f2, $f2, $f30 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+ "dmfc1 %[iAlpha], $f20 \n\t"
+ "dmfc1 %[iBeta], $f22 \n\t"
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "pcmpgth $f4, $f4, $f24 \n\t"
+ "pcmpgth $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "and $f0, $f0, $f28 \n\t"
+ "and $f2, $f2, $f30 \n\t"
+ "pcmpgth $f28, $f8, $f24 \n\t"
+ "pcmpgth $f30, $f10, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+ "dli $8, 0x2 \n\t"
+ "and $f4, $f4, $f28 \n\t"
+ "and $f6, $f6, $f30 \n\t"
+ "pcmpgth $f8, $f8, $f24 \n\t"
+ "pcmpgth $f10, $f10, $f26 \n\t"
+ "and $f4, $f4, $f8 \n\t"
+ "and $f6, $f6, $f10 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "punpcklhw $f24, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f24, $f24 \n\t"
+ "mov.d $f10, $f8 \n\t"
+ "gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f8, $f12, $f12 \n\t"
+ "paddh $f10, $f14, $f14 \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f20 \n\t"
+ "paddh $f10, $f10, $f22 \n\t"
+ "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f24 \n\t"
+ "paddh $f10, $f10, $f26 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrah $f8, $f8, $f20 \n\t"
+ "psrah $f10, $f10, $f20 \n\t"
+ "and $f24, $f0, $f8 \n\t"
+ "and $f26, $f2, $f10 \n\t"
+ "pandn $f8, $f0, $f16 \n\t"
+ "pandn $f10, $f2, $f18 \n\t"
+ "or $f24, $f24, $f8 \n\t"
+ "or $f26, $f26, $f10 \n\t"
+ "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
+ "paddh $f28, $f8, $f8 \n\t"
+ "paddh $f30, $f10, $f10 \n\t"
+ "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f20 \n\t"
+ "paddh $f30, $f30, $f22 \n\t"
+ "gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f16 \n\t"
+ "paddh $f30, $f30, $f18 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f28, $f28, $f8 \n\t"
+ "paddh $f30, $f30, $f10 \n\t"
+ "pandn $f8, $f4, $f20 \n\t"
+ "pandn $f10, $f6, $f22 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "psrah $f28, $f28, $f20 \n\t"
+ "psrah $f30, $f30, $f20 \n\t"
+ "and $f16, $f4, $f28 \n\t"
+ "and $f18, $f6, $f30 \n\t"
+ "or $f16, $f16, $f8 \n\t"
+ "or $f18, $f18, $f10 \n\t"
+ "gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "packushb $f24, $f24, $f26 \n\t"
+ "packushb $f26, $f16, $f18 \n\t"
+ "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "paddh $f24, $f8, $f8 \n\t"
+ "paddh $f26, $f10, $f10 \n\t"
+ "dmtc1 %[iAlpha], $f20 \n\t"
+ "dmtc1 %[iBeta], $f22 \n\t"
+ "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "paddh $f24, $f24, $f12 \n\t"
+ "paddh $f26, $f26, $f14 \n\t"
+ "mov.d $f16, $f0 \n\t"
+ "mov.d $f18, $f2 \n\t"
+ "pandn $f0, $f0, $f20 \n\t"
+ "pandn $f2, $f2, $f22 \n\t"
+ "dmtc1 $8, $f20 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+ "psrah $f24, $f24, $f20 \n\t"
+ "psrah $f26, $f26, $f20 \n\t"
+ "and $f16, $f16, $f24 \n\t"
+ "and $f18, $f18, $f26 \n\t"
+ "or $f16, $f16, $f0 \n\t"
+ "or $f18, $f18, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t"
+ "paddh $f20, $f0, $f0 \n\t"
+ "paddh $f22, $f2, $f2 \n\t"
+ "gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+ "gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t"
+ "paddh $f20, $f20, $f12 \n\t"
+ "paddh $f22, $f22, $f14 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+ "dmtc1 $8, $f8 \n\t"
+ "psrah $f20, $f20, $f8 \n\t"
+ "psrah $f22, $f22, $f8 \n\t"
+ "and $f12, $f4, $f20 \n\t"
+ "and $f14, $f6, $f22 \n\t"
+ "pandn $f4, $f4, $f0 \n\t"
+ "pandn $f6, $f6, $f2 \n\t"
+ "or $f12, $f12, $f4 \n\t"
+ "or $f14, $f14, $f6 \n\t"
+ "packushb $f16, $f16, $f18 \n\t"
+ "packushb $f18, $f12, $f14 \n\t"
+ "gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0x0($11) \n\t"
+ "gslqc1 $f6, $f4, 0x10($11) \n\t"
+ "gslqc1 $f10, $f8, 0x20($11) \n\t"
+ "gslqc1 $f14, $f12, 0x30($11) \n\t"
+ "mov.d $f26, $f2 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "mov.d $f30, $f10 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+ "punpcklbh $f28, $f30, $f14 \n\t"
+ "punpckhbh $f30, $f30, $f14 \n\t"
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "dli %[iAlpha], 0x20 \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "gsswlc1 $f0, 0x3($9) \n\t"
+ "gsswrc1 $f0, 0x0($9) \n\t"
+ "daddu $12, $9, %[iStride] \n\t"
+ "gsswlc1 $f20, 0x3($12) \n\t"
+ "gsswrc1 $f20, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($12) \n\t"
+ "gsswrc1 $f4, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f24, 0x3($12) \n\t"
+ "gsswrc1 $f24, 0x0($12) \n\t"
+ "dsrl $f0, $f0, $f8 \n\t"
+ "dsrl $f20, $f20, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+ "dsrl $f24, $f24, $f8 \n\t"
+ "gsswlc1 $f0, 0x3($10) \n\t"
+ "gsswrc1 $f0, 0x0($10) \n\t"
+ "daddu $13, $10, %[iStride] \n\t"
+ "daddu $8, $13, %[iStride] \n\t"
+ "gsswlc1 $f20, 0x3($13) \n\t"
+ "gsswrc1 $f20, 0x0($13) \n\t"
+ "daddu $13, $8, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($8) \n\t"
+ "gsswrc1 $f4, 0x0($8) \n\t"
+ "gsswlc1 $f24, 0x3($13) \n\t"
+ "gsswrc1 $f24, 0x0($13) \n\t"
+ "gsswlc1 $f2, 0x3(%[pPixCb]) \n\t"
+ "gsswrc1 $f2, 0x0(%[pPixCb]) \n\t"
+ "daddu $12, %[pPixCb], %[iStride] \n\t"
+ "gsswlc1 $f22, 0x3($12) \n\t"
+ "gsswrc1 $f22, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($12) \n\t"
+ "gsswrc1 $f6, 0x0($12) \n\t"
+ "daddu $12, $12, %[iStride] \n\t"
+ "gsswlc1 $f26, 0x3($12) \n\t"
+ "gsswrc1 $f26, 0x0($12) \n\t"
+ "dsrl $f2, $f2, $f8 \n\t"
+ "dsrl $f22, $f22, $f8 \n\t"
+ "dsrl $f6, $f6, $f8 \n\t"
+ "dsrl $f26, $f26, $f8 \n\t"
+ "gsswlc1 $f2, 0x3(%[pPixCr]) \n\t"
+ "gsswrc1 $f2, 0x0(%[pPixCr]) \n\t"
+ "daddu $13, %[pPixCr], %[iStride] \n\t"
+ "daddu $8, $13, %[iStride] \n\t"
+ "gsswlc1 $f22, 0x3($13) \n\t"
+ "gsswrc1 $f22, 0x0($13) \n\t"
+ "daddu $13, $8, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($8) \n\t"
+ "gsswrc1 $f6, 0x0($8) \n\t"
+ "gsswlc1 $f26, 0x3($13) \n\t"
+ "gsswrc1 $f26, 0x0($13) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+ "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+ int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+ unsigned char tmp[320] __attribute__((aligned(32)));
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
+ "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
+ "daddu $8, %[pPixCb], %[iStride] \n\t"
+ "gsldlc1 $f0, 0x7(%[pPixCb]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pPixCb]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+
+ "daddu $10, %[pPixCr], %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7(%[pPixCr]) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0(%[pPixCr]) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsldlc1 $f24, 0x7($11) \n\t"
+ "gsldlc1 $f28, 0x7($10) \n\t"
+ "gsldrc1 $f24, 0x0($11) \n\t"
+ "gsldrc1 $f28, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "punpcklwd $f0, $f0, $f16 \n\t"
+ "punpcklwd $f4, $f4, $f20 \n\t"
+ "punpcklwd $f8, $f8, $f24 \n\t"
+ "punpcklwd $f12, $f12, $f28 \n\t"
+ "gsldlc1 $f16, 0x7($9) \n\t"
+ "gsldlc1 $f20, 0x7($11) \n\t"
+ "gsldrc1 $f16, 0x0($9) \n\t"
+ "gsldrc1 $f20, 0x0($11) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsldlc1 $f16, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f6, $f16 \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "gsldlc1 $f16, 0x7($9) \n\t"
+ "gsldlc1 $f20, 0x7($11) \n\t"
+ "gsldrc1 $f16, 0x0($9) \n\t"
+ "gsldrc1 $f20, 0x0($11) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f10, $f16 \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+
+ "gsldlc1 $f16, 0x7($8) \n\t"
+ "gsldlc1 $f20, 0x7($10) \n\t"
+ "gsldrc1 $f16, 0x0($8) \n\t"
+ "gsldrc1 $f20, 0x0($10) \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "mov.d $f14, $f16 \n\t"
+
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+ "daddiu $11, %[tmp], 0x70 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0($11) \n\t"
+ "gssqc1 $f22, $f20, 0x10($11) \n\t"
+ "gssqc1 $f6, $f4, 0x20($11) \n\t"
+ "gssqc1 $f26, $f24, 0x30($11) \n\t"
+
+ "lb $8, 0x3(%[pTC]) \n\t"
+ "lb $9, 0x2(%[pTC]) \n\t"
+ "lb $10, 0x1(%[pTC]) \n\t"
+ "lb $11, 0x0(%[pTC]) \n\t"
+
+ "and $12, $8, 0xFFFF \n\t"
+ "dmtc1 $12, $f8 \n\t"
+
+ "and $9, $9, 0xFFFF \n\t"
+ "dmtc1 $9, $f12 \n\t"
+ "mov.d $f16, $f12 \n\t"
+
+ "and $9, $10, 0xFFFF \n\t"
+ "dmtc1 $9, $f20 \n\t"
+ "xor $f0, $f0, $f0 \n\t"
+ "mov.d $f24, $f20 \n\t"
+ "and $9, $11, 0xFFFF \n\t"
+ "punpcklhw $f24, $f24, $f8 \n\t"
+
+ "mov.d $f4, $f8 \n\t"
+ "dmtc1 $9, $f28 \n\t"
+ "mov.d $f0, $f28 \n\t"
+
+ "punpcklhw $f28, $f28, $f12 \n\t"
+ "punpcklhw $f20, $f20, $f4 \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ "punpcklhw $f28, $f28, $f20 \n\t"
+ "gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "punpcklhw $f0, $f0, $f24 \n\t"
+
+ "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
+ "punpckhhw $f2, $f0, $f28 \n\t"
+ "punpcklhw $f0, $f0, $f28 \n\t"
+ "gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t"
+ "psubh $f8, $f4, $f0 \n\t"
+ "psubh $f10, $f6, $f2 \n\t"
+ "gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f16, $f12, $f12 \n\t"
+ "mov.d $f18, $f16 \n\t"
+
+ "dmtc1 %[iBeta], $f8 \n\t"
+ "punpcklhw $f12, $f8, $f8 \n\t"
+ "punpcklwd $f8, $f12, $f12 \n\t"
+ "mov.d $f10, $f8 \n\t"
+
+ "gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
+ "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
+ "punpckhbh $f10, $f24, $f4 \n\t"
+ "punpcklbh $f8, $f24, $f4 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+
+ "gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
+ "punpcklbh $f8, $f28, $f4 \n\t"
+ "punpckhbh $f10, $f28, $f4 \n\t"
+ "punpcklbh $f28, $f30, $f6 \n\t"
+ "punpckhbh $f30, $f30, $f6 \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+ "punpckhbh $f14, $f12, $f4 \n\t"
+ "punpcklbh $f12, $f12, $f4 \n\t"
+ "punpckhbh $f22, $f20, $f4 \n\t"
+ "punpcklbh $f20, $f20, $f4 \n\t"
+ "gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t"
+ "gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t"
+ "punpcklbh $f24, $f26, $f6 \n\t"
+ "punpckhbh $f26, $f26, $f6 \n\t"
+
+ "dli $13, 0x4 \n\t"
+ "gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
+ "dmtc1 $13, $f24 \n\t"
+ "punpcklhw $f28, $f24, $f24 \n\t"
+ "punpcklwd $f24, $f28, $f28 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "dli $12, 0x2 \n\t"
+ "dli $13, 0x3 \n\t"
+
+ "gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmfc1 %[iBeta], $f2 \n\t"
+ "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
+ "gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+ "pcmpgth $f24, $f0, $f4 \n\t"
+ "pcmpgth $f26, $f2, $f6 \n\t"
+
+ "dmtc1 $12, $f0 \n\t"
+ "dmtc1 $13, $f2 \n\t"
+ "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t"
+ "psubh $f24, $f12, $f8 \n\t"
+ "psubh $f26, $f14, $f10 \n\t"
+ "psllh $f24, $f24, $f0 \n\t"
+ "psllh $f26, $f26, $f0 \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f28 \n\t"
+ "paddh $f26, $f26, $f30 \n\t"
+ "psrah $f24, $f24, $f2 \n\t"
+ "psrah $f26, $f26, $f2 \n\t"
+ "pmaxsh $f4, $f4, $f24 \n\t"
+ "pmaxsh $f6, $f6, $f26 \n\t"
+
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "pminsh $f24, $f24, $f4 \n\t"
+ "pminsh $f26, $f26, $f6 \n\t"
+
+ "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
+ "psubh $f4, $f8, $f12 \n\t"
+ "psubh $f6, $f10, $f14 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "pcmpgth $f24, $f16, $f4 \n\t"
+ "pcmpgth $f26, $f18, $f6 \n\t"
+ "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "psubh $f4, $f4, $f8 \n\t"
+ "psubh $f6, $f6, $f10 \n\t"
+ WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+ "pcmpgth $f28, $f28, $f4 \n\t"
+ "pcmpgth $f30, $f30, $f6 \n\t"
+
+ "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
+ "and $f24, $f24, $f28 \n\t"
+ "and $f26, $f26, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f12 \n\t"
+ "psubh $f22, $f22, $f14 \n\t"
+ WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
+ "pcmpgth $f4, $f4, $f20 \n\t"
+ "pcmpgth $f6, $f6, $f22 \n\t"
+
+ "gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t"
+ "gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "and $f24, $f24, $f4 \n\t"
+ "and $f26, $f26, $f6 \n\t"
+ "gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t"
+ "and $f24, $f24, $f0 \n\t"
+ "and $f26, $f26, $f2 \n\t"
+
+ "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
+ "and $f4, $f4, $f24 \n\t"
+ "and $f6, $f6, $f26 \n\t"
+ "gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
+ "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
+ "gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t"
+
+ "dmtc1 $12, $f0 \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ "psllh $f24, $f24, $f0 \n\t"
+ "psllh $f26, $f26, $f0 \n\t"
+ "paddh $f24, $f24, $f20 \n\t"
+ "paddh $f26, $f26, $f22 \n\t"
+ "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
+ "paddh $f24, $f24, $f0 \n\t"
+ "paddh $f26, $f26, $f2 \n\t"
+ "dmtc1 %[iBeta], $f2 \n\t"
+
+ "dmtc1 $13, $f0 \n\t"
+ "gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t"
+ "psrah $f24, $f24, $f0 \n\t"
+ "psrah $f26, $f26, $f0 \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "pmaxsh $f20, $f20, $f24 \n\t"
+ "pmaxsh $f22, $f22, $f26 \n\t"
+ "pminsh $f0, $f0, $f20 \n\t"
+ "pminsh $f2, $f2, $f22 \n\t"
+
+ "dmfc1 %[iAlpha], $f0 \n\t"
+ "dmfc1 %[iBeta], $f2 \n\t"
+ "gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t"
+ "psubh $f24, $f4, $f20 \n\t"
+ "psubh $f26, $f6, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f16, $f16, $f24 \n\t"
+ "pcmpgth $f18, $f18, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f4 \n\t"
+ "psubh $f26, $f26, $f6 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+
+ "gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+
+ "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
+ "psubh $f24, $f24, $f20 \n\t"
+ "psubh $f26, $f26, $f22 \n\t"
+ WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+ "pcmpgth $f28, $f28, $f24 \n\t"
+ "pcmpgth $f30, $f30, $f26 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
+ "dmtc1 %[iAlpha], $f0 \n\t"
+ "dmtc1 %[iBeta], $f2 \n\t"
+ "and $f16, $f16, $f28 \n\t"
+ "and $f18, $f18, $f30 \n\t"
+ "and $f0, $f0, $f16 \n\t"
+ "and $f2, $f2, $f18 \n\t"
+
+ "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
+ "paddh $f8, $f8, $f16 \n\t"
+ "paddh $f10, $f10, $f18 \n\t"
+ "paddh $f4, $f4, $f0 \n\t"
+ "paddh $f6, $f6, $f2 \n\t"
+ "psubh $f12, $f12, $f16 \n\t"
+ "psubh $f14, $f14, $f18 \n\t"
+ "psubh $f20, $f20, $f0 \n\t"
+ "psubh $f22, $f22, $f2 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f4, $f6 \n\t"
+ "packushb $f12, $f12, $f14 \n\t"
+ "packushb $f14, $f20, $f22 \n\t"
+
+ "gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t"
+ "gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
+ "daddiu $11, %[tmp], 0x70 \n\t"
+
+ "gslqc1 $f2, $f0, 0x0($11) \n\t"
+ "gslqc1 $f6, $f4, 0x10($11) \n\t"
+ "gslqc1 $f10, $f8, 0x20($11) \n\t"
+ "gslqc1 $f14, $f12, 0x30($11) \n\t"
+
+ "punpcklbh $f24, $f2, $f6 \n\t"
+ "punpckhbh $f26, $f2, $f6 \n\t"
+ "punpckhbh $f2, $f0, $f4 \n\t"
+ "punpcklbh $f0, $f0, $f4 \n\t"
+
+ "punpcklbh $f28, $f10, $f14 \n\t"
+ "punpckhbh $f30, $f10, $f14 \n\t"
+ "punpckhbh $f10, $f8, $f12 \n\t"
+ "punpcklbh $f8, $f8, $f12 \n\t"
+
+ "punpcklhw $f16, $f2, $f10 \n\t"
+ "punpckhhw $f18, $f2, $f10 \n\t"
+ "punpckhhw $f2, $f0, $f8 \n\t"
+ "punpcklhw $f0, $f0, $f8 \n\t"
+ "punpcklhw $f20, $f26, $f30 \n\t"
+ "punpckhhw $f22, $f26, $f30 \n\t"
+ "punpckhhw $f26, $f24, $f28 \n\t"
+ "punpcklhw $f24, $f24, $f28 \n\t"
+
+ "punpcklwd $f4, $f2, $f26 \n\t"
+ "punpckhwd $f6, $f2, $f26 \n\t"
+ "punpckhwd $f2, $f0, $f24 \n\t"
+ "punpcklwd $f0, $f0, $f24 \n\t"
+ "punpcklwd $f8, $f18, $f22 \n\t"
+ "punpckhwd $f10, $f18, $f22 \n\t"
+ "punpckhwd $f18, $f16, $f20 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+
+ "mov.d $f20, $f2 \n\t"
+ "mov.d $f22, $f18 \n\t"
+ "mov.d $f2, $f16 \n\t"
+ "mov.d $f24, $f6 \n\t"
+ "mov.d $f26, $f10 \n\t"
+ "mov.d $f6, $f8 \n\t"
+
+ "dli %[iAlpha], 0x20 \n\t"
+ "daddu $8, %[pPixCb], %[iStride] \n\t"
+ "gsswlc1 $f0, 0x3(%[pPixCb]) \n\t"
+ "gsswlc1 $f20, 0x3($8) \n\t"
+ "gsswrc1 $f0, 0x0(%[pPixCb]) \n\t"
+ "gsswrc1 $f20, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($9) \n\t"
+ "gsswlc1 $f24, 0x3($8) \n\t"
+ "gsswrc1 $f4, 0x0($9) \n\t"
+ "gsswrc1 $f24, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "dmtc1 %[iAlpha], $f8 \n\t"
+
+ "dsrl $f0, $f0, $f8 \n\t"
+ "dsrl $f20, $f20, $f8 \n\t"
+ "dsrl $f4, $f4, $f8 \n\t"
+ "dsrl $f24, $f24, $f8 \n\t"
+ "daddu $10, %[pPixCr], %[iStride] \n\t"
+ "gsswlc1 $f0, 0x3(%[pPixCr]) \n\t"
+ "gsswlc1 $f20, 0x3($10) \n\t"
+ "gsswrc1 $f0, 0x0(%[pPixCr]) \n\t"
+ "gsswrc1 $f20, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f4, 0x3($11) \n\t"
+ "gsswlc1 $f24, 0x3($10) \n\t"
+ "gsswrc1 $f4, 0x0($11) \n\t"
+ "gsswrc1 $f24, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f2, 0x3($9) \n\t"
+ "gsswlc1 $f22, 0x3($8) \n\t"
+ "gsswrc1 $f2, 0x0($9) \n\t"
+ "gsswrc1 $f22, 0x0($8) \n\t"
+ "daddu $9, $8, %[iStride] \n\t"
+ "daddu $8, $9, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($9) \n\t"
+ "gsswlc1 $f26, 0x3($8) \n\t"
+ "gsswrc1 $f6, 0x0($9) \n\t"
+ "gsswrc1 $f26, 0x0($8) \n\t"
+
+ "dsrl $f2, $f2, $f8 \n\t"
+ "dsrl $f22, $f22, $f8 \n\t"
+ "dsrl $f6, $f6, $f8 \n\t"
+ "dsrl $f26, $f26, $f8 \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f2, 0x3($11) \n\t"
+ "gsswlc1 $f22, 0x3($10) \n\t"
+ "gsswrc1 $f2, 0x0($11) \n\t"
+ "gsswrc1 $f22, 0x0($10) \n\t"
+ "daddu $11, $10, %[iStride] \n\t"
+ "daddu $10, $11, %[iStride] \n\t"
+ "gsswlc1 $f6, 0x3($11) \n\t"
+ "gsswlc1 $f26, 0x3($10) \n\t"
+ "gsswrc1 $f6, 0x0($11) \n\t"
+ "gsswrc1 $f26, 0x0($10) \n\t"
+ : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+ : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+ [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+ "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
+ __asm__ volatile(
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
+ "gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
+ "pcmpeqh $f8, $f8, $f8 \n\t"
+ "dli $8, 0xF \n\t"
+ "dmtc1 $8, $f6 \n\t"
+ "psrlh $f8, $f8, $f6 \n\t"
+ "packushb $f8, $f8, $f8 \n\t"
+
+ "pminub $f0, $f0, $f8 \n\t"
+ "pminub $f2, $f2, $f8 \n\t"
+ "pminub $f4, $f4, $f8 \n\t"
+ "gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
+ "gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
+ "gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
+ "gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
+ :
+ : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+}
--- a/codec/common/mips64/deblock_mmi.c
+++ /dev/null
@@ -1,2826 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2018, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file deblock_mmi.c
- *
- * \brief Loongson optimize
- *
- * \date 20/07/2018 Created
- *
- *************************************************************************************
- */
-#include <stdint.h>
-#include "asmdefs_mmi.h"
-
-void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
- int32_t iBeta, int8_t *pTC) {
- unsigned char tmp[512] __attribute__((aligned(32)));
- BACKUP_REG;
- __asm__ volatile (
- ".set arch=loongson3a \n\t"
- "dsll $8, %[iStride], 0x1 \n\t"
- "daddu $8, $8, %[iStride] \n\t"
- "dsubu $14, %[pPix], $8 \n\t"
-
- "dsll $8, %[iStride], 0x1 \n\t"
- "dsubu $9, %[pPix], $8 \n\t"
-
- "dmtc1 %[iAlpha], $f0 \n\t"
- "dsubu $13, %[pPix], %[iStride] \n\t"
- "daddu %[iStride], %[iStride], %[pPix] \n\t"
- "daddu $12, $8, %[pPix] \n\t"
-
- "punpcklhw $f0, $f0, $f0 \n\t"
- "lb $8, 0x0(%[pTC]) \n\t"
- "punpcklwd $f0, $f0, $f0 \n\t"
- "mov.d $f2, $f0 \n\t"
- "gssqc1 $f2, $f0, 432-112(%[tmp]) \n\t"
- "dmtc1 %[iBeta], $f0 \n\t"
- "lb %[iAlpha], 0x1(%[pTC]) \n\t"
- "dli %[iBeta], 0xFFFF \n\t"
- "punpcklhw $f0, $f0, $f0 \n\t"
- "and $10, %[iAlpha], %[iBeta] \n\t"
- "punpcklwd $f0, $f0, $f0 \n\t"
- "mov.d $f2, $f0 \n\t"
- "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
- "dmtc1 $10, $f4 \n\t"
- "mov.d $f8, $f4 \n\t"
- "dmtc1 %[iAlpha], $f16 \n\t"
- "and %[iAlpha], $8, %[iBeta] \n\t"
- "dmtc1 %[iAlpha], $f20 \n\t"
- "mov.d $f24, $f20 \n\t"
- "mov.d $f28, $f20 \n\t"
- "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f0 \n\t"
-
- "lb %[iAlpha], 0x3(%[pTC]) \n\t"
- "lb %[pTC], 0x2(%[pTC]) \n\t"
- "dmtc1 $10, $f12 \n\t"
- "punpcklhw $f0, $f0, $f16 \n\t"
- "and $8, %[iAlpha], %[iBeta] \n\t"
- "punpcklhw $f24, $f24, $f8 \n\t"
- "punpcklhw $f20, $f20, $f4 \n\t"
- "punpcklhw $f0, $f0, $f24 \n\t"
- "punpcklhw $f28, $f28, $f12 \n\t"
- "punpcklhw $f28, $f28, $f20 \n\t"
- "punpckhhw $f2, $f0, $f28 \n\t"
- "punpcklhw $f0, $f0, $f28 \n\t"
- "gssqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
- "dmtc1 $8, $f0 \n\t"
- "and %[iAlpha], %[iAlpha], %[iBeta] \n\t"
- "mov.d $f8, $f0 \n\t"
- "dmtc1 %[iAlpha], $f16 \n\t"
- "and %[iAlpha], %[pTC], %[iBeta] \n\t"
- "dmtc1 $8, $f12 \n\t"
- "dmtc1 %[iAlpha], $f20 \n\t"
- "punpcklhw $f20, $f20, $f0 \n\t"
-
- "xor $f0, $f0, $f0 \n\t"
- "dmtc1 %[iAlpha], $f24 \n\t"
- "and %[pTC], %[pTC], %[iBeta] \n\t"
- "punpcklhw $f24, $f24, $f8 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "dmtc1 %[pTC], $f4 \n\t"
-
- "gslqc1 $f10, $f8, 0x0($9) \n\t"
- "punpckhbh $f10, $f8, $f0 \n\t"
- "punpcklbh $f8, $f8, $f0 \n\t"
-
- "dli %[iAlpha], 0x4 \n\t"
- "seh %[pTC], %[iAlpha] \n\t"
- "punpcklhw $f28, $f28, $f12 \n\t"
- "punpcklhw $f28, $f28, $f20 \n\t"
- "gslqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
- "gslqc1 $f14, $f12, 0x0($13) \n\t"
- "gsldxc1 $f2, 0x0($12, $0) \n\t"
- "punpckhbh $f22, $f20, $f0 \n\t"
- "punpcklbh $f20, $f20, $f0 \n\t"
- "gssqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
- "punpckhbh $f22, $f2, $f0 \n\t"
- "punpcklbh $f20, $f2, $f0 \n\t"
- "gssqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
- "punpcklhw $f4, $f4, $f16 \n\t"
- "gslqc1 $f18, $f16, 0x0($14) \n\t"
- "punpcklhw $f4, $f4, $f24 \n\t"
- "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
- "punpckhhw $f6, $f4, $f28 \n\t"
- "punpcklhw $f4, $f4, $f28 \n\t"
- "punpckhbh $f26, $f24, $f0 \n\t"
- "punpcklbh $f24, $f24, $f0 \n\t"
- "punpckhbh $f14, $f12, $f0 \n\t"
- "punpcklbh $f12, $f12, $f0 \n\t"
- "punpckhbh $f18, $f16, $f0 \n\t"
- "punpcklbh $f16, $f16, $f0 \n\t"
- "psubh $f28, $f12, $f16 \n\t"
- "psubh $f30, $f14, $f18 \n\t"
- "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
- "gslqc1 $f18, $f16, 432-336(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
- "pcmpgth $f20, $f16, $f28 \n\t"
- "pcmpgth $f22, $f18, $f30 \n\t"
- "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
- "psubh $f28, $f24, $f0 \n\t"
- "psubh $f30, $f26, $f2 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
- "pcmpgth $f20, $f16, $f28 \n\t"
- "pcmpgth $f22, $f18, $f30 \n\t"
- "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
- "pavgh $f20, $f12, $f24 \n\t"
- "pavgh $f22, $f14, $f26 \n\t"
- "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 432-256(%[tmp]) \n\t"
- "psubh $f20, $f20, $f28 \n\t"
- "psubh $f22, $f22, $f30 \n\t"
- "psubh $f20, $f20, $f0 \n\t"
- "psubh $f22, $f22, $f2 \n\t"
- "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
- "psubh $f20, $f24, $f12 \n\t"
- "psubh $f22, $f26, $f14 \n\t"
- "gssqc1 $f26, $f24, 432-32(%[tmp]) \n\t"
- "psubh $f24, $f24, $f0 \n\t"
- "psubh $f26, $f26, $f2 \n\t"
- "gssqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
- WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
- "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
- "pcmpgth $f20, $f20, $f28 \n\t"
- "pcmpgth $f22, $f22, $f30 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
- "pcmpgth $f28, $f16, $f24 \n\t"
- "pcmpgth $f30, $f18, $f26 \n\t"
-
- "xor $f0, $f0, $f0 \n\t"
- "and $f20, $f20, $f28 \n\t"
- "and $f22, $f22, $f30 \n\t"
- "psubh $f24, $f12, $f8 \n\t"
- "psubh $f26, $f14, $f10 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
- "pcmpgth $f28, $f16, $f24 \n\t"
- "pcmpgth $f30, $f18, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
- "and $f20, $f20, $f28 \n\t"
- "and $f22, $f22, $f30 \n\t"
- "pcmpgth $f28, $f24, $f0 \n\t"
- "pcmpgth $f30, $f26, $f0 \n\t"
- "pcmpeqh $f24, $f24, $f0 \n\t"
- "pcmpeqh $f26, $f26, $f0 \n\t"
- "or $f28, $f28, $f24 \n\t"
- "or $f30, $f30, $f26 \n\t"
- "and $f20, $f20, $f28 \n\t"
- "and $f22, $f22, $f30 \n\t"
- "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
- "dmtc1 %[pTC], $f20 \n\t"
- "punpckhhw $f26, $f20, $f20 \n\t"
- "punpcklhw $f24, $f20, $f20 \n\t"
- "punpcklwd $f20, $f24, $f24 \n\t"
- "mov.d $f22, $f20 \n\t"
- "gssqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
- "psubh $f24, $f0, $f20 \n\t"
- "dli $11, 0x2 \n\t"
- "psubh $f26, $f0, $f22 \n\t"
- "dmtc1 $11, $f28 \n\t"
- "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 432-240(%[tmp]) \n\t"
- "psllh $f20, $f20, $f28 \n\t"
- "psllh $f22, $f22, $f28 \n\t"
- "psubh $f28, $f8, $f0 \n\t"
- "psubh $f30, $f10, $f2 \n\t"
- "paddh $f28, $f28, $f20 \n\t"
- "paddh $f30, $f30, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
- "paddh $f28, $f28, $f20 \n\t"
- "paddh $f30, $f30, $f22 \n\t"
- "dli $11, 0x3 \n\t"
- "dmtc1 $11, $f20 \n\t"
- "psrah $f28, $f28, $f20 \n\t"
- "psrah $f30, $f30, $f20 \n\t"
- "gslqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
- "pmaxsh $f24, $f24, $f28 \n\t"
- "pmaxsh $f26, $f26, $f30 \n\t"
- "gslqc1 $f2, $f0, 432-320(%[tmp]) \n\t"
- "pminsh $f20, $f20, $f24 \n\t"
- "pminsh $f22, $f22, $f26 \n\t"
-
- "and $f20, $f20, $f0 \n\t"
- "and $f22, $f22, $f2 \n\t"
- "gslqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
- "gssqc1 $f22, $f20, 432-64(%[tmp]) \n\t"
- "xor $f0, $f0, $f0 \n\t"
- "gssqc1 $f26, $f24, 432-384(%[tmp]) \n\t"
- "psubh $f20, $f0, $f24 \n\t"
- "psubh $f22, $f0, $f26 \n\t"
- "gssqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
- "mov.d $f24, $f20 \n\t"
- "mov.d $f26, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
- "paddh $f20, $f20, $f28 \n\t"
- "paddh $f22, $f22, $f30 \n\t"
- "paddh $f28, $f8, $f8 \n\t"
- "paddh $f30, $f10, $f10 \n\t"
- "psubh $f20, $f20, $f28 \n\t"
- "psubh $f22, $f22, $f30 \n\t"
- "dli $11, 0x1 \n\t"
- "dmtc1 $11, $f28 \n\t"
- "psrah $f20, $f20, $f28 \n\t"
- "psrah $f22, $f22, $f28 \n\t"
- "pmaxsh $f24, $f24, $f20 \n\t"
- "pmaxsh $f26, $f26, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-384(%[tmp]) \n\t"
- "pminsh $f20, $f20, $f24 \n\t"
- "pminsh $f22, $f22, $f26 \n\t"
-
- "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-288(%[tmp]) \n\t"
- "and $f20, $f20, $f24 \n\t"
- "and $f22, $f22, $f26 \n\t"
- "and $f20, $f20, $f28 \n\t"
- "and $f22, $f22, $f30 \n\t"
- "gslqc1 $f26, $f24, 432-240(%[tmp]) \n\t"
- "gssqc1 $f22, $f20, 432-96(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 432-352(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
- "paddh $f20, $f20, $f28 \n\t"
- "paddh $f22, $f22, $f30 \n\t"
- "paddh $f28, $f24, $f24 \n\t"
- "paddh $f30, $f26, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
- "dli $11, 0x1 \n\t"
- "psubh $f20, $f20, $f28 \n\t"
- "dmtc1 $11, $f28 \n\t"
- "psubh $f22, $f22, $f30 \n\t"
-
- "psrah $f20, $f20, $f28 \n\t"
- "psrah $f22, $f22, $f28 \n\t"
- "gslqc1 $f30, $f28, 0x0(%[iStride]) \n\t"
- "pmaxsh $f24, $f24, $f20 \n\t"
- "pmaxsh $f26, $f26, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
- "pminsh $f20, $f20, $f24 \n\t"
- "pminsh $f22, $f22, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
- "and $f20, $f20, $f24 \n\t"
- "and $f22, $f22, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-256(%[tmp]) \n\t"
- "and $f20, $f20, $f24 \n\t"
- "and $f22, $f22, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x0($9) \n\t"
- "punpcklbh $f28, $f30, $f0 \n\t"
- "punpckhbh $f30, $f30, $f0 \n\t"
- "gssqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
-
- "gslqc1 $f30, $f28, 0x0($12) \n\t"
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "gssqc1 $f22, $f20, 432-48(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 0x0($14) \n\t"
- "gssqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x0($13) \n\t"
- "punpcklbh $f28, $f30, $f0 \n\t"
- "punpckhbh $f30, $f30, $f0 \n\t"
- "punpcklbh $f20, $f22, $f0 \n\t"
- "punpckhbh $f22, $f22, $f0 \n\t"
- "gssqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "gssqc1 $f26, $f24, 432-400(%[tmp]) \n\t"
-
- "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x0(%[pPix]) \n\t"
- "psubh $f28, $f28, $f20 \n\t"
- "psubh $f30, $f30, $f22 \n\t"
- "gssqc1 $f22, $f20, 432-16(%[tmp]) \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "pcmpgth $f20, $f16, $f28 \n\t"
- "pcmpgth $f22, $f18, $f30 \n\t"
- "gslqc1 $f30, $f28, 432-384(%[tmp]) \n\t"
- "gssqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
-
- "psubh $f28, $f24, $f28 \n\t"
- "psubh $f30, $f26, $f30 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
- "pcmpgth $f20, $f16, $f28 \n\t"
- "pcmpgth $f22, $f18, $f30 \n\t"
- "gssqc1 $f22, $f20, 432-256(%[tmp]) \n\t"
-
- "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 432-80(%[tmp]) \n\t"
- "pavgh $f20, $f20, $f24 \n\t"
- "pavgh $f22, $f22, $f26 \n\t"
- "gssqc1 $f22, $f20, 432-304(%[tmp]) \n\t"
-
- "gslqc1 $f22, $f20, 432-288(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-256(%[tmp]) \n\t"
- "psubh $f20, $f4, $f20 \n\t"
- "psubh $f22, $f6, $f22 \n\t"
- "psubh $f20, $f20, $f28 \n\t"
- "psubh $f22, $f22, $f30 \n\t"
- "gssqc1 $f22, $f20, 432-224(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 432-400(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-352(%[tmp]) \n\t"
- "psubh $f20, $f24, $f20 \n\t"
- "psubh $f22, $f26, $f22 \n\t"
- "psubh $f24, $f24, $f28 \n\t"
- "psubh $f26, $f26, $f30 \n\t"
- "gssqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
- "mov.d $f28, $f20 \n\t"
- "mov.d $f30, $f22 \n\t"
- WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
- "gslqc1 $f22, $f20, 432-112(%[tmp]) \n\t"
- "pcmpgth $f20, $f20, $f28 \n\t"
- "pcmpgth $f22, $f22, $f30 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
- "pcmpgth $f28, $f16, $f24 \n\t"
- "pcmpgth $f30, $f18, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-368(%[tmp]) \n\t"
-
- "and $f20, $f20, $f28 \n\t"
- "and $f22, $f22, $f30 \n\t"
- "gslqc1 $f30, $f28, 432-400(%[tmp]) \n\t"
- "psubh $f28, $f28, $f24 \n\t"
- "psubh $f30, $f30, $f26 \n\t"
- "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
- "psubh $f24, $f24, $f0 \n\t"
- "psubh $f26, $f26, $f2 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
- "pcmpgth $f16, $f16, $f28 \n\t"
- "pcmpgth $f18, $f18, $f30 \n\t"
- "gslqc1 $f30, $f28, 432-96(%[tmp]) \n\t"
- "and $f20, $f20, $f16 \n\t"
- "and $f22, $f22, $f18 \n\t"
- "xor $f0, $f0, $f0 \n\t"
-
- "paddh $f8, $f8, $f28 \n\t"
- "paddh $f10, $f10, $f30 \n\t"
- "pcmpgth $f16, $f4, $f0 \n\t"
- "pcmpgth $f18, $f6, $f0 \n\t"
- "pcmpeqh $f28, $f4, $f0 \n\t"
- "pcmpeqh $f30, $f6, $f0 \n\t"
- "or $f16, $f16, $f28 \n\t"
- "or $f18, $f18, $f30 \n\t"
- "and $f20, $f20, $f16 \n\t"
- "and $f22, $f22, $f18 \n\t"
- "gslqc1 $f18, $f16, 432-224(%[tmp]) \n\t"
- "gssqc1 $f22, $f20, 432-320(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
- "dli $11, 0x2 \n\t"
- "psubh $f28, $f0, $f16 \n\t"
- "psubh $f30, $f0, $f18 \n\t"
- "psubh $f2, $f0, $f6 \n\t"
- "psubh $f0, $f0, $f4 \n\t"
- "dmfc1 %[iAlpha], $f28 \n\t"
- "dmtc1 $11, $f28 \n\t"
- "psllh $f20, $f20, $f28 \n\t"
- "psllh $f22, $f22, $f28 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "paddh $f24, $f24, $f20 \n\t"
- "paddh $f26, $f26, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-336(%[tmp]) \n\t"
- "paddh $f24, $f24, $f20 \n\t"
- "paddh $f26, $f26, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-368(%[tmp]) \n\t"
- "dli $11, 0x3 \n\t"
- "gssqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
- "dmfc1 %[iAlpha], $f0 \n\t"
- "dmtc1 $11, $f0 \n\t"
- "psrah $f24, $f24, $f0 \n\t"
- "psrah $f26, $f26, $f0 \n\t"
- "dmtc1 %[iAlpha], $f0 \n\t"
- "pmaxsh $f28, $f28, $f24 \n\t"
- "pmaxsh $f30, $f30, $f26 \n\t"
- "pminsh $f16, $f16, $f28 \n\t"
- "pminsh $f18, $f18, $f30 \n\t"
- "gslqc1 $f30, $f28, 432-320(%[tmp]) \n\t"
- "and $f16, $f16, $f28 \n\t"
- "and $f18, $f18, $f30 \n\t"
- "mov.d $f24, $f0 \n\t"
- "mov.d $f26, $f2 \n\t"
- "gslqc1 $f2, $f0, 432-16(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 432-304(%[tmp]) \n\t"
- "paddh $f0, $f0, $f28 \n\t"
- "paddh $f2, $f2, $f30 \n\t"
- "gssqc1 $f18, $f16, 432-272(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 432-368(%[tmp]) \n\t"
- "dli $11, 0x1 \n\t"
- "paddh $f16, $f16, $f16 \n\t"
- "paddh $f18, $f18, $f18 \n\t"
- "psubh $f0, $f0, $f16 \n\t"
- "psubh $f2, $f2, $f18 \n\t"
-
- "dmtc1 $11, $f28 \n\t"
- "gslqc1 $f18, $f16, 432-64(%[tmp]) \n\t"
- "psrah $f0, $f0, $f28 \n\t"
- "psrah $f2, $f2, $f28 \n\t"
- "pmaxsh $f24, $f24, $f0 \n\t"
- "pmaxsh $f26, $f26, $f2 \n\t"
- "gslqc1 $f2, $f0, 432-400(%[tmp]) \n\t"
- "pminsh $f28, $f4, $f24 \n\t"
- "pminsh $f30, $f6, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-320(%[tmp]) \n\t"
- "and $f28, $f28, $f24 \n\t"
- "and $f30, $f30, $f26 \n\t"
- "dmfc1 %[iAlpha], $f24 \n\t"
- "dmfc1 %[iBeta], $f26 \n\t"
- "gslqc1 $f26, $f24, 432-288(%[tmp]) \n\t"
- "and $f28, $f28, $f24 \n\t"
- "and $f30, $f30, $f26 \n\t"
- "paddh $f20, $f20, $f28 \n\t"
- "paddh $f22, $f22, $f30 \n\t"
- "packushb $f8, $f8, $f10 \n\t"
- "packushb $f10, $f20, $f22 \n\t"
- "gslqc1 $f22, $f20, 432-272(%[tmp]) \n\t"
- "paddh $f0, $f0, $f20 \n\t"
- "paddh $f2, $f2, $f22 \n\t"
- "paddh $f12, $f12, $f16 \n\t"
- "paddh $f14, $f14, $f18 \n\t"
- "packushb $f12, $f12, $f14 \n\t"
- "packushb $f14, $f0, $f2 \n\t"
-
- "gslqc1 $f2, $f0, 432-32(%[tmp]) \n\t"
- "psubh $f0, $f0, $f16 \n\t"
- "psubh $f2, $f2, $f18 \n\t"
- "gslqc1 $f18, $f16, 432-80(%[tmp]) \n\t"
- "psubh $f16, $f16, $f20 \n\t"
- "gslqc1 $f26, $f24, 432-48(%[tmp]) \n\t"
- "psubh $f18, $f18, $f22 \n\t"
-
- "gslqc1 $f22, $f20, 432-240(%[tmp]) \n\t"
- "paddh $f20, $f20, $f24 \n\t"
- "paddh $f22, $f22, $f26 \n\t"
- "gslqc1 $f26, $f24, 432-304(%[tmp]) \n\t"
- "packushb $f0, $f0, $f2 \n\t"
- "packushb $f2, $f16, $f18 \n\t"
- "gslqc1 $f18, $f16, 432-384(%[tmp]) \n\t"
- "paddh $f16, $f16, $f24 \n\t"
- "paddh $f18, $f18, $f26 \n\t"
- "gssqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 432-352(%[tmp]) \n\t"
- "mov.d $f28, $f0 \n\t"
- "mov.d $f30, $f2 \n\t"
- "paddh $f0, $f0, $f0 \n\t"
- "paddh $f2, $f2, $f2 \n\t"
-
- "dmtc1 %[iAlpha], $f24 \n\t"
- "dmtc1 %[iBeta], $f26 \n\t"
-
- "psubh $f16, $f16, $f0 \n\t"
- "psubh $f18, $f18, $f2 \n\t"
- "dli $11, 0x1 \n\t"
- "gslqc1 $f2, $f0, 432-336(%[tmp]) \n\t"
- "gssqc1 $f10, $f8, 0x0($9) \n\t"
- "dmtc1 $11, $f8 \n\t"
- "psrah $f16, $f16, $f8 \n\t"
- "psrah $f18, $f18, $f8 \n\t"
- "pmaxsh $f0, $f0, $f16 \n\t"
- "pmaxsh $f2, $f2, $f18 \n\t"
- "pminsh $f4, $f4, $f0 \n\t"
- "pminsh $f6, $f6, $f2 \n\t"
- "gslqc1 $f2, $f0, 480-208(%[tmp]) \n\t"
-
- "gslqc1 $f10, $f8, 428-256+4(%[tmp]) \n\t"
- "and $f4, $f4, $f24 \n\t"
- "and $f6, $f6, $f26 \n\t"
- "and $f4, $f4, $f8 \n\t"
- "and $f6, $f6, $f10 \n\t"
- "gssqc1 $f14, $f12, 0x0($13) \n\t"
- "paddh $f28, $f28, $f4 \n\t"
- "paddh $f30, $f30, $f6 \n\t"
- "packushb $f20, $f20, $f22 \n\t"
- "packushb $f22, $f28, $f30 \n\t"
- "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
- "gssqc1 $f22, $f20, 0x0(%[iStride]) \n\t"
- : [pPix]"+&r"((unsigned char *)pPix)
- : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
- [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
- : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
- "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
- "$f22", "$f24", "$f26", "$f28", "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
- uint8_t *pDst) {
- BACKUP_REG;
- __asm__ volatile(
- ".set arch=loongson3a \n\t"
- "dsll $8, %[iStride], 0x3 \n\t"
- "daddu $8, $8, %[pPixY] \n\t"
-
- "daddu $9, %[pPixY], %[iStride] \n\t"
- "daddu $10, $8, %[iStride] \n\t"
- "gsldlc1 $f0, 0x7(%[pPixY]) \n\t"
- "gsldlc1 $f2, 0x7($8) \n\t"
- "gsldlc1 $f4, 0x7($9) \n\t"
- "gsldlc1 $f6, 0x7($10) \n\t"
- "gsldrc1 $f0, 0x0(%[pPixY]) \n\t"
- "gsldrc1 $f2, 0x0($8) \n\t"
- "gsldrc1 $f4, 0x0($9) \n\t"
- "gsldrc1 $f6, 0x0($10) \n\t"
- "daddu %[pPixY], $9, %[iStride] \n\t"
- "daddu $8, $10, %[iStride] \n\t"
- "daddu $9, %[pPixY], %[iStride] \n\t"
- "daddu $10, $8, %[iStride] \n\t"
- "gsldlc1 $f8, 0x7(%[pPixY]) \n\t"
- "gsldlc1 $f10, 0x7($8) \n\t"
- "gsldlc1 $f12, 0x7($9) \n\t"
- "gsldlc1 $f14, 0x7($10) \n\t"
- "gsldrc1 $f8, 0x0(%[pPixY]) \n\t"
- "gsldrc1 $f10, 0x0($8) \n\t"
- "gsldrc1 $f12, 0x0($9) \n\t"
- "gsldrc1 $f14, 0x0($10) \n\t"
-
- "daddu %[pPixY], $9, %[iStride] \n\t"
- "daddu $8, $10, %[iStride] \n\t"
- "daddu $9, %[pPixY], %[iStride] \n\t"
- "daddu $10, $8, %[iStride] \n\t"
- "gsldlc1 $f16, 0x7(%[pPixY]) \n\t"
- "gsldlc1 $f18, 0x7($8) \n\t"
- "gsldlc1 $f20, 0x7($9) \n\t"
- "gsldlc1 $f22, 0x7($10) \n\t"
- "gsldrc1 $f16, 0x0(%[pPixY]) \n\t"
- "gsldrc1 $f18, 0x0($8) \n\t"
- "gsldrc1 $f20, 0x0($9) \n\t"
- "gsldrc1 $f22, 0x0($10) \n\t"
- "daddu %[pPixY], $9, %[iStride] \n\t"
- "daddu $8, $10, %[iStride] \n\t"
- "daddu $9, %[pPixY], %[iStride] \n\t"
- "daddu $10, $8, %[iStride] \n\t"
- "gsldlc1 $f24, 0x7(%[pPixY]) \n\t"
- "gsldlc1 $f26, 0x7($8) \n\t"
-
- "gsldlc1 $f28, 0x7($9) \n\t"
- "gsldlc1 $f30, 0x7($10) \n\t"
- "gsldrc1 $f24, 0x0(%[pPixY]) \n\t"
- "gsldrc1 $f26, 0x0($8) \n\t"
- "gsldrc1 $f28, 0x0($9) \n\t"
- "gsldrc1 $f30, 0x0($10) \n\t"
-
- MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
- $f14, $f16, $f18, $f20, $f22, $f24,
- $f26, $f28, $f30, $9, $10)
-
- "gssqc1 $f18, $f16, 0x0(%[pDst]) \n\t"
- "gssqc1 $f10, $f8, 0x10(%[pDst]) \n\t"
- "gssqc1 $f14, $f12, 0x20(%[pDst]) \n\t"
- "gssqc1 $f30, $f28, 0x30(%[pDst]) \n\t"
- "gssqc1 $f22, $f20, 0x40(%[pDst]) \n\t"
- "gssqc1 $f6, $f4, 0x50(%[pDst]) \n\t"
- "gssqc1 $f26, $f24, 0x60(%[pDst]) \n\t"
- "gssqc1 $f2, $f0, 0x70(%[pDst]) \n\t"
- : [pPixY] "+&r"((unsigned char *)pPixY)
- : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
- : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
- "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
- "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
- uint8_t *pSrc) {
- BACKUP_REG;
- __asm__ volatile(
- ".set arch=loongson3a \n\t"
- "gslqc1 $f2, $f0, 0x0(%[pSrc]) \n\t"
- "gslqc1 $f6, $f4, 0x10(%[pSrc]) \n\t"
- "gslqc1 $f10, $f8, 0x20(%[pSrc]) \n\t"
- "gslqc1 $f14, $f12, 0x30(%[pSrc]) \n\t"
- "gslqc1 $f18, $f16, 0x40(%[pSrc]) \n\t"
- "gslqc1 $f22, $f20, 0x50(%[pSrc]) \n\t"
- "gslqc1 $f26, $f24, 0x60(%[pSrc]) \n\t"
- "gslqc1 $f30, $f28, 0x70(%[pSrc]) \n\t"
-
- MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
- $f14, $f16, $f18, $f20, $f22, $f24,
- $f26, $f28, $f30, $9, $10)
-
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f16, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f8, 0x7($8) \n\t"
- "gssdrc1 $f16, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f8, 0x0($8) \n\t"
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f12, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f28, 0x7($8) \n\t"
- "gssdrc1 $f12, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f28, 0x0($8) \n\t"
-
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f20, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f4, 0x7($8) \n\t"
- "gssdrc1 $f20, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f4, 0x0($8) \n\t"
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f24, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f0, 0x7($8) \n\t"
- "gssdrc1 $f24, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f0, 0x0($8) \n\t"
-
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f18, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f10, 0x7($8) \n\t"
- "gssdrc1 $f18, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f10, 0x0($8) \n\t"
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f14, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f30, 0x7($8) \n\t"
- "gssdrc1 $f14, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f30, 0x0($8) \n\t"
-
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f22, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f6, 0x7($8) \n\t"
- "gssdrc1 $f22, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f6, 0x0($8) \n\t"
- "daddu %[pPixY], $8, %[iStride] \n\t"
- "daddu $8, %[pPixY], %[iStride] \n\t"
- "gssdlc1 $f26, 0x7(%[pPixY]) \n\t"
- "gssdlc1 $f2, 0x7($8) \n\t"
- "gssdrc1 $f26, 0x0(%[pPixY]) \n\t"
- "gssdrc1 $f2, 0x0($8) \n\t"
- : [pPixY] "+&r"((unsigned char *)pPixY)
- : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
- : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
- "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
- "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
- int32_t iBeta) {
- unsigned char tmp[720] __attribute__((aligned(32)));
- BACKUP_REG;
- __asm__ volatile (
- ".set arch=loongson3a \n\t"
- "dsll $11, %[iStride], 0x2 \n\t"
- "xor $f8, $f8, $f8 \n\t"
- "daddu $14, %[iStride], %[pPix] \n\t"
- "dsubu $8, %[pPix], $11 \n\t"
- "gslqc1 $f14, $f12, 0x0($8) \n\t"
- "gslqc1 $f22, $f20, 0x0(%[pPix]) \n\t"
- "daddu $9, %[iStride], %[iStride] \n\t"
- "daddu $10, $9, %[iStride] \n\t"
- "move $12, $9 \n\t"
- "dsubu $8, %[pPix], $9 \n\t"
- "gslqc1 $f6, $f4, 0x0($8) \n\t"
- "dsubu $9, %[pPix], %[iStride] \n\t"
- "gslqc1 $f18, $f16, 0x0($9) \n\t"
- "daddu $13, %[iStride], %[pPix] \n\t"
-
- "move %[iStride], $12 \n\t"
- "daddu $15, $12, %[pPix] \n\t"
-
- "daddu $12, %[pPix], $10 \n\t"
- "dsubu $11, %[pPix], $10 \n\t"
-
- "gslqc1 $f26, $f24, 0x0($11) \n\t"
- "daddu %[iStride], %[iStride], %[pPix] \n\t"
- "dmtc1 %[iAlpha], $f0 \n\t"
-
- "punpcklhw $f28, $f0, $f0 \n\t"
- "punpcklwd $f0, $f28, $f28 \n\t"
- "mov.d $f2, $f0 \n\t"
- "gssqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
- "dmtc1 %[iBeta], $f0 \n\t"
- "gsldxc1 $f10, 0x0($15, $0) \n\t"
- "punpcklhw $f28, $f0, $f0 \n\t"
- "punpcklwd $f0, $f28, $f28 \n\t"
- "punpckhbh $f30, $f10, $f8 \n\t"
- "mov.d $f2, $f0 \n\t"
-
- "punpcklbh $f28, $f10, $f8 \n\t"
- "gssqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
- "gssqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
- "mov.d $f0, $f4 \n\t"
- "gssqc1 $f22, $f20, 704-272(%[tmp]) \n\t"
- "gssqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
- "mov.d $f4, $f16 \n\t"
- "punpckhbh $f22, $f20, $f8 \n\t"
- "punpcklbh $f20, $f20, $f8 \n\t"
- "punpckhbh $f6, $f4, $f8 \n\t"
- "punpcklbh $f4, $f4, $f8 \n\t"
-
- "psubh $f28, $f20, $f4 \n\t"
- "psubh $f30, $f22, $f6 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
- "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
- "punpckhbh $f2, $f0, $f8 \n\t"
- "punpcklbh $f0, $f0, $f8 \n\t"
- "gssqc1 $f18, $f16, 688-272(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 0x0($14) \n\t"
- "gssqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
-
- "psubh $f28, $f4, $f0 \n\t"
- "psubh $f30, $f6, $f2 \n\t"
-
- "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
- "punpckhbh $f18, $f16, $f8 \n\t"
- "punpcklbh $f16, $f16, $f8 \n\t"
- "pcmpgth $f0, $f0, $f28 \n\t"
- "pcmpgth $f2, $f2, $f30 \n\t"
- "gssqc1 $f18, $f16, 640-384(%[tmp]) \n\t"
- "psubh $f28, $f20, $f16 \n\t"
- "psubh $f30, $f22, $f18 \n\t"
- "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
- "punpckhbh $f26, $f24, $f8 \n\t"
- "punpcklbh $f24, $f24, $f8 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "gssqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
- "gssqc1 $f6, $f4, 640-144(%[tmp]) \n\t"
- "gssqc1 $f22, $f20, 640-400(%[tmp]) \n\t"
- "pcmpgth $f16, $f16, $f28 \n\t"
- "pcmpgth $f18, $f18, $f30 \n\t"
- "and $f0, $f0, $f16 \n\t"
- "and $f2, $f2, $f18 \n\t"
- "gslqc1 $f18, $f16, 640-320(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
- "dli %[iAlpha], 0x2 \n\t"
- "dli %[iBeta], 0x2 \n\t"
- "pcmpgth $f16, $f16, $f28 \n\t"
- "pcmpgth $f18, $f18, $f30 \n\t"
- "and $f0, $f0, $f16 \n\t"
- "and $f2, $f2, $f18 \n\t"
- "dmtc1 %[iAlpha], $f16 \n\t"
- "dmtc1 %[iBeta], $f10 \n\t"
- "gssqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 640-320(%[tmp]) \n\t"
-
- "punpcklhw $f28, $f16, $f16 \n\t"
- "psrah $f16, $f0, $f10 \n\t"
- "psrah $f18, $f2, $f10 \n\t"
- "punpcklwd $f28, $f28, $f28 \n\t"
- "mov.d $f30, $f28 \n\t"
- "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
- "paddh $f16, $f16, $f28 \n\t"
- "paddh $f18, $f18, $f30 \n\t"
- "gssqc1 $f18, $f16, 640-576(%[tmp]) \n\t"
- "pcmpgth $f16, $f16, $f8 \n\t"
- "pcmpgth $f18, $f18, $f10 \n\t"
- "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
-
- "gssqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
- "psubh $f28, $f4, $f24 \n\t"
- "psubh $f30, $f6, $f26 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
- "pcmpgth $f16, $f16, $f28 \n\t"
- "pcmpgth $f18, $f18, $f30 \n\t"
-
- "gslqc1 $f2, $f0, 640-416(%[tmp]) \n\t"
- "and $f16, $f16, $f8 \n\t"
- "and $f18, $f18, $f10 \n\t"
- "gssqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 640-512(%[tmp]) \n\t"
- "psubh $f28, $f20, $f0 \n\t"
- "psubh $f30, $f22, $f2 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
- "pcmpgth $f16, $f16, $f28 \n\t"
- "pcmpgth $f18, $f18, $f30 \n\t"
-
- "and $f16, $f16, $f8 \n\t"
- "and $f18, $f18, $f10 \n\t"
- "gssqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
-
- "gslqc1 $f18, $f16, 640-544(%[tmp]) \n\t"
- "xor $f8, $f8, $f8 \n\t"
- "pandn $f16, $f16, $f24 \n\t"
- "dli %[iAlpha], 0x4 \n\t"
- "pandn $f18, $f18, $f26 \n\t"
- "gssqc1 $f18, $f16, 640-16(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f16 \n\t"
- "punpcklhw $f28, $f16, $f16 \n\t"
- "dli %[iAlpha], 0x1 \n\t"
- "punpckhbh $f18, $f12, $f8 \n\t"
- "dmtc1 %[iAlpha], $f30 \n\t"
- "punpcklbh $f16, $f12, $f8 \n\t"
- "psllh $f16, $f16, $f30 \n\t"
- "psllh $f18, $f18, $f30 \n\t"
- "paddh $f16, $f16, $f24 \n\t"
- "paddh $f18, $f18, $f26 \n\t"
- "gslqc1 $f2, $f0, 640-480(%[tmp]) \n\t"
- "paddh $f16, $f16, $f24 \n\t"
- "paddh $f18, $f18, $f26 \n\t"
- "paddh $f16, $f16, $f24 \n\t"
- "paddh $f18, $f18, $f26 \n\t"
- "paddh $f16, $f16, $f0 \n\t"
- "paddh $f18, $f18, $f2 \n\t"
-
- "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
- "punpcklwd $f28, $f28, $f28 \n\t"
- "mov.d $f30, $f28 \n\t"
- "paddh $f16, $f16, $f4 \n\t"
- "paddh $f18, $f18, $f6 \n\t"
- "gssqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
- "paddh $f16, $f16, $f20 \n\t"
- "paddh $f18, $f18, $f22 \n\t"
- "paddh $f16, $f16, $f28 \n\t"
- "paddh $f18, $f18, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 640-384(%[tmp]) \n\t"
- "pandn $f24, $f24, $f28 \n\t"
- "pandn $f26, $f26, $f30 \n\t"
- "gssqc1 $f26, $f24, 640-80(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x0($12) \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "punpckhbh $f26, $f24, $f8 \n\t"
- "punpcklbh $f24, $f24, $f8 \n\t"
- "psllh $f24, $f24, $f10 \n\t"
- "psllh $f26, $f26, $f10 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "paddh $f24, $f24, $f0 \n\t"
- "paddh $f26, $f26, $f2 \n\t"
-
- "dli %[iAlpha], 0x3 \n\t"
- "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
- "paddh $f24, $f24, $f20 \n\t"
- "paddh $f26, $f26, $f22 \n\t"
- "paddh $f24, $f24, $f4 \n\t"
- "paddh $f26, $f26, $f6 \n\t"
- "paddh $f24, $f24, $f0 \n\t"
- "paddh $f26, $f26, $f2 \n\t"
- "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "psrah $f24, $f24, $f10 \n\t"
- "psrah $f26, $f26, $f10 \n\t"
- "and $f24, $f24, $f0 \n\t"
- "and $f26, $f26, $f2 \n\t"
- "gssqc1 $f26, $f24, 640-112(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
- "pandn $f24, $f24, $f28 \n\t"
- "pandn $f26, $f26, $f30 \n\t"
- "gssqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 640-528(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-368(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 640-544(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "psrah $f16, $f16, $f10 \n\t"
- "psrah $f18, $f18, $f10 \n\t"
- "and $f16, $f16, $f0 \n\t"
- "and $f18, $f18, $f2 \n\t"
- "gslqc1 $f2, $f0, 640-624(%[tmp]) \n\t"
- "paddh $f28, $f4, $f20 \n\t"
- "paddh $f30, $f6, $f22 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "paddh $f24, $f24, $f0 \n\t"
- "paddh $f26, $f26, $f2 \n\t"
- "gslqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
- "dli %[iAlpha], 0x2 \n\t"
-
- "dmtc1 %[iAlpha], $f10 \n\t"
- "paddh $f20, $f20, $f4 \n\t"
- "paddh $f22, $f22, $f6 \n\t"
- "psrah $f24, $f24, $f10 \n\t"
- "psrah $f26, $f26, $f10 \n\t"
- "and $f28, $f28, $f24 \n\t"
- "and $f30, $f30, $f26 \n\t"
-
- "gslqc1 $f26, $f24, 640-384(%[tmp]) \n\t"
- "gssqc1 $f30, $f28, 640-64(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
- "pandn $f28, $f28, $f24 \n\t"
- "pandn $f30, $f30, $f26 \n\t"
- "gssqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
- "paddh $f28, $f28, $f24 \n\t"
- "paddh $f30, $f30, $f26 \n\t"
- "paddh $f28, $f28, $f20 \n\t"
- "paddh $f30, $f30, $f22 \n\t"
- "paddh $f28, $f28, $f8 \n\t"
- "paddh $f30, $f30, $f10 \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "gslqc1 $f22, $f20, 640-560(%[tmp]) \n\t"
- "psrah $f28, $f28, $f10 \n\t"
- "psrah $f30, $f30, $f10 \n\t"
- "and $f20, $f20, $f28 \n\t"
- "and $f22, $f22, $f30 \n\t"
- "gssqc1 $f22, $f20, 640-32(%[tmp]) \n\t"
-
- "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 640-592(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
- "paddh $f28, $f20, $f20 \n\t"
- "paddh $f30, $f22, $f22 \n\t"
- "paddh $f20, $f4, $f24 \n\t"
- "paddh $f22, $f6, $f26 \n\t"
- "paddh $f24, $f24, $f0 \n\t"
- "paddh $f26, $f26, $f2 \n\t"
- "paddh $f28, $f28, $f20 \n\t"
- "paddh $f30, $f30, $f22 \n\t"
- "paddh $f28, $f28, $f8 \n\t"
- "paddh $f30, $f30, $f10 \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "gslqc1 $f22, $f20, 640-544(%[tmp]) \n\t"
- "psrah $f28, $f28, $f10 \n\t"
- "psrah $f30, $f30, $f10 \n\t"
- "dli %[iAlpha], 0x1 \n\t"
- "pandn $f20, $f20, $f28 \n\t"
- "pandn $f22, $f22, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-480(%[tmp]) \n\t"
- "paddh $f28, $f28, $f4 \n\t"
- "paddh $f30, $f30, $f6 \n\t"
- "gslqc1 $f6, $f4, 640-400(%[tmp]) \n\t"
- "paddh $f28, $f28, $f4 \n\t"
- "paddh $f30, $f30, $f6 \n\t"
- "gslqc1 $f6, $f4, 640-544(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "gssqc1 $f22, $f20, 640-352(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 640-368(%[tmp]) \n\t"
- "psllh $f28, $f28, $f10 \n\t"
- "psllh $f30, $f30, $f10 \n\t"
- "dli %[iAlpha], 0x3 \n\t"
- "paddh $f28, $f28, $f24 \n\t"
- "paddh $f30, $f30, $f26 \n\t"
- "paddh $f20, $f20, $f28 \n\t"
- "paddh $f22, $f22, $f30 \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
-
- "dli %[iAlpha], 0x2 \n\t"
- "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
- "psrah $f20, $f20, $f10 \n\t"
- "psrah $f22, $f22, $f10 \n\t"
- "and $f4, $f4, $f20 \n\t"
- "and $f6, $f6, $f22 \n\t"
- "gslqc1 $f22, $f20, 640-480(%[tmp]) \n\t"
- "gssqc1 $f6, $f4, 640-96(%[tmp]) \n\t"
- "gslqc1 $f6, $f4, 640-384(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-400(%[tmp]) \n\t"
- "paddh $f24, $f4, $f4 \n\t"
- "paddh $f26, $f6, $f6 \n\t"
- "paddh $f4, $f4, $f8 \n\t"
- "paddh $f6, $f6, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-144(%[tmp]) \n\t"
- "paddh $f28, $f28, $f20 \n\t"
- "paddh $f30, $f30, $f22 \n\t"
- "paddh $f4, $f4, $f8 \n\t"
- "paddh $f6, $f6, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-592(%[tmp]) \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "paddh $f20, $f20, $f8 \n\t"
- "paddh $f22, $f22, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-624(%[tmp]) \n\t"
- "paddh $f24, $f24, $f8 \n\t"
- "dmtc1 %[iAlpha], $f8 \n\t"
- "paddh $f26, $f26, $f10 \n\t"
- "dli %[iAlpha], 0x1 \n\t"
- "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "psrah $f24, $f24, $f8 \n\t"
- "psrah $f26, $f26, $f8 \n\t"
- "psllh $f4, $f4, $f10 \n\t"
- "psllh $f6, $f6, $f10 \n\t"
- "paddh $f4, $f4, $f20 \n\t"
- "paddh $f6, $f6, $f22 \n\t"
- "dli %[iAlpha], 0x3 \n\t"
-
- "gslqc1 $f22, $f20, 656-272(%[tmp]) \n\t"
- "pandn $f28, $f28, $f24 \n\t"
- "pandn $f30, $f30, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-416(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
- "paddh $f24, $f24, $f4 \n\t"
- "paddh $f26, $f26, $f6 \n\t"
- "gslqc1 $f6, $f4, 640-560(%[tmp]) \n\t"
- "psrah $f24, $f24, $f10 \n\t"
- "psrah $f26, $f26, $f10 \n\t"
- "and $f4, $f4, $f24 \n\t"
- "and $f6, $f6, $f26 \n\t"
-
- "xor $f8, $f8, $f8 \n\t"
- "gslqc1 $f26, $f24, 704-272(%[tmp]) \n\t"
- "gssqc1 $f6, $f4, 640-128(%[tmp]) \n\t"
- "gslqc1 $f6, $f4, 672-272(%[tmp]) \n\t"
- "punpcklbh $f4, $f6, $f8 \n\t"
- "punpckhbh $f6, $f6, $f8 \n\t"
- "gssqc1 $f6, $f4, 640-448(%[tmp]) \n\t"
- "gslqc1 $f6, $f4, 688-272(%[tmp]) \n\t"
- "punpcklbh $f4, $f6, $f8 \n\t"
- "punpckhbh $f6, $f6, $f8 \n\t"
- "punpcklbh $f24, $f26, $f8 \n\t"
- "punpckhbh $f26, $f26, $f8 \n\t"
- "gssqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
- "punpcklbh $f20, $f22, $f8 \n\t"
- "punpckhbh $f22, $f22, $f8 \n\t"
- "gslqc1 $f30, $f28, 0x0($14) \n\t"
- "gssqc1 $f6, $f4, 640-496(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 640-432(%[tmp]) \n\t"
-
- "gsldxc1 $f0, 0x8($15, $0) \n\t"
- "punpcklbh $f28, $f30, $f8 \n\t"
- "punpckhbh $f30, $f30, $f8 \n\t"
- "gssqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
-
- "punpcklbh $f28, $f0, $f8 \n\t"
- "punpckhbh $f30, $f0, $f8 \n\t"
- "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
- "gssqc1 $f30, $f28, 640-528(%[tmp]) \n\t"
-
- "psubh $f28, $f24, $f4 \n\t"
- "psubh $f30, $f26, $f6 \n\t"
- "psubh $f24, $f24, $f8 \n\t"
- "psubh $f26, $f26, $f10 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "gslqc1 $f10, $f8, 640-16(%[tmp]) \n\t"
- "gssqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
- "or $f16, $f16, $f8 \n\t"
- "or $f18, $f18, $f10 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
- "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
- "psubh $f28, $f4, $f28 \n\t"
- "psubh $f30, $f6, $f30 \n\t"
-
- "gslqc1 $f2, $f0, 640-512(%[tmp]) \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "pcmpgth $f4, $f0, $f28 \n\t"
- "pcmpgth $f6, $f2, $f30 \n\t"
- "pcmpgth $f28, $f0, $f24 \n\t"
- "pcmpgth $f30, $f2, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-320(%[tmp]) \n\t"
- "and $f4, $f4, $f28 \n\t"
- "and $f6, $f6, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-560(%[tmp]) \n\t"
- "pcmpgth $f24, $f24, $f28 \n\t"
- "pcmpgth $f26, $f26, $f30 \n\t"
- "and $f4, $f4, $f24 \n\t"
- "and $f6, $f6, $f26 \n\t"
-
- "gslqc1 $f26, $f24, 640-576(%[tmp]) \n\t"
- "pcmpgth $f24, $f24, $f28 \n\t"
- "pcmpgth $f26, $f26, $f30 \n\t"
- "xor $f8, $f8, $f8 \n\t"
- "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
- "punpcklbh $f12, $f14, $f8 \n\t"
- "punpckhbh $f14, $f14, $f8 \n\t"
- "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
- "psubh $f28, $f28, $f20 \n\t"
- "psubh $f30, $f30, $f22 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "pcmpgth $f24, $f24, $f28 \n\t"
- "pcmpgth $f26, $f26, $f30 \n\t"
-
- "dli %[iAlpha], 0x1 \n\t"
- "gslqc1 $f10, $f8, 640-560(%[tmp]) \n\t"
- "and $f24, $f24, $f8 \n\t"
- "and $f26, $f26, $f10 \n\t"
- "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
- "psubh $f28, $f28, $f8 \n\t"
- "psubh $f30, $f30, $f10 \n\t"
- "dmtc1 %[iAlpha], $f10 \n\t"
-
- "psllh $f12, $f12, $f10 \n\t"
- "psllh $f14, $f14, $f10 \n\t"
- "gssqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-512(%[tmp]) \n\t"
-
- "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
- "paddh $f12, $f12, $f20 \n\t"
- "paddh $f14, $f14, $f22 \n\t"
- "paddh $f12, $f12, $f20 \n\t"
- "paddh $f14, $f14, $f22 \n\t"
- "paddh $f12, $f12, $f20 \n\t"
- "paddh $f14, $f14, $f22 \n\t"
- "paddh $f12, $f12, $f8 \n\t"
- "paddh $f14, $f14, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 640-560(%[tmp]) \n\t"
- "paddh $f12, $f12, $f8 \n\t"
- "paddh $f14, $f14, $f10 \n\t"
- WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
- "pcmpgth $f24, $f24, $f28 \n\t"
- "pcmpgth $f26, $f26, $f30 \n\t"
- "and $f24, $f24, $f0 \n\t"
- "and $f26, $f26, $f2 \n\t"
- "gssqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
-
- "gslqc1 $f2, $f0, 736-272(%[tmp]) \n\t"
- "dli %[iAlpha], 0x3 \n\t"
- "gslqc1 $f30, $f28, 640-368(%[tmp]) \n\t"
- "and $f24, $f0, $f16 \n\t"
- "and $f26, $f2, $f18 \n\t"
- "pandn $f16, $f0, $f28 \n\t"
- "pandn $f18, $f2, $f30 \n\t"
- "or $f24, $f24, $f16 \n\t"
- "or $f26, $f26, $f18 \n\t"
- "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
- "paddh $f12, $f12, $f16 \n\t"
- "paddh $f14, $f14, $f18 \n\t"
- "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
- "paddh $f12, $f12, $f28 \n\t"
- "paddh $f14, $f14, $f30 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "psrah $f12, $f12, $f28 \n\t"
- "psrah $f14, $f14, $f28 \n\t"
- "and $f12, $f12, $f8 \n\t"
- "and $f14, $f14, $f10 \n\t"
- "pandn $f8, $f8, $f20 \n\t"
- "pandn $f10, $f10, $f22 \n\t"
- "or $f12, $f12, $f8 \n\t"
- "or $f14, $f14, $f10 \n\t"
- "and $f28, $f4, $f12 \n\t"
- "and $f30, $f6, $f14 \n\t"
- "gslqc1 $f14, $f12, 640-64(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
- "or $f12, $f12, $f8 \n\t"
- "or $f14, $f14, $f10 \n\t"
- "pandn $f8, $f4, $f20 \n\t"
- "pandn $f10, $f6, $f22 \n\t"
- "or $f28, $f28, $f8 \n\t"
- "or $f30, $f30, $f10 \n\t"
-
- "dli %[iAlpha], 0x2 \n\t"
- "and $f8, $f0, $f12 \n\t"
- "and $f10, $f2, $f14 \n\t"
- "gslqc1 $f14, $f12, 640-480(%[tmp]) \n\t"
- "pandn $f12, $f0, $f12 \n\t"
- "pandn $f14, $f2, $f14 \n\t"
- "or $f8, $f8, $f12 \n\t"
- "or $f10, $f10, $f14 \n\t"
- "packushb $f24, $f24, $f26 \n\t"
- "packushb $f26, $f28, $f30 \n\t"
- "gssqc1 $f10, $f8, 640-336(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 656-272(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
- "paddh $f8, $f20, $f8 \n\t"
- "paddh $f10, $f22, $f10 \n\t"
- "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
- "paddh $f28, $f28, $f16 \n\t"
- "paddh $f30, $f30, $f18 \n\t"
- "paddh $f8, $f8, $f28 \n\t"
- "paddh $f10, $f10, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
- "paddh $f8, $f8, $f28 \n\t"
- "paddh $f10, $f10, $f30 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "psrah $f8, $f8, $f28 \n\t"
- "psrah $f10, $f10, $f28 \n\t"
- "dli %[iAlpha], 0x1 \n\t"
- "gslqc1 $f30, $f28, 640-544(%[tmp]) \n\t"
- "and $f24, $f24, $f8 \n\t"
- "and $f26, $f26, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-448(%[tmp]) \n\t"
- "pandn $f28, $f28, $f8 \n\t"
- "pandn $f30, $f30, $f10 \n\t"
- "or $f24, $f24, $f28 \n\t"
- "or $f26, $f26, $f30 \n\t"
- "and $f12, $f4, $f24 \n\t"
- "and $f14, $f6, $f26 \n\t"
- "pandn $f24, $f4, $f8 \n\t"
- "pandn $f26, $f6, $f10 \n\t"
- "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
- "paddh $f8, $f8, $f28 \n\t"
- "paddh $f10, $f10, $f30 \n\t"
- "paddh $f8, $f8, $f16 \n\t"
- "paddh $f10, $f10, $f18 \n\t"
- "or $f12, $f12, $f24 \n\t"
- "or $f14, $f14, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-336(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "packushb $f24, $f24, $f26 \n\t"
- "packushb $f26, $f12, $f14 \n\t"
- "psllh $f8, $f8, $f28 \n\t"
- "psllh $f10, $f10, $f28 \n\t"
- "gssqc1 $f26, $f24, 672-272(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 640-96(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-352(%[tmp]) \n\t"
- "or $f24, $f24, $f28 \n\t"
- "or $f26, $f26, $f30 \n\t"
- "dli %[iAlpha], 0x3 \n\t"
-
- "and $f12, $f0, $f24 \n\t"
- "and $f14, $f2, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-144(%[tmp]) \n\t"
- "pandn $f24, $f0, $f24 \n\t"
- "pandn $f26, $f2, $f26 \n\t"
- "or $f12, $f12, $f24 \n\t"
- "or $f14, $f14, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-544(%[tmp]) \n\t"
- "gssqc1 $f14, $f12, 640-352(%[tmp]) \n\t"
- "gslqc1 $f14, $f12, 640-464(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
- "paddh $f12, $f12, $f28 \n\t"
- "paddh $f14, $f14, $f30 \n\t"
- "paddh $f8, $f8, $f12 \n\t"
- "paddh $f10, $f10, $f14 \n\t"
- "gslqc1 $f14, $f12, 640-448(%[tmp]) \n\t"
- "paddh $f20, $f20, $f8 \n\t"
- "paddh $f22, $f22, $f10 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
- "psrah $f20, $f20, $f28 \n\t"
- "psrah $f22, $f22, $f28 \n\t"
- "and $f24, $f24, $f20 \n\t"
- "and $f26, $f26, $f22 \n\t"
- "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
- "paddh $f8, $f8, $f20 \n\t"
- "paddh $f10, $f10, $f22 \n\t"
- "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
- "dli %[iAlpha], 0x2 \n\t"
- "paddh $f20, $f20, $f28 \n\t"
- "paddh $f22, $f22, $f30 \n\t"
- "paddh $f16, $f12, $f12 \n\t"
- "paddh $f18, $f14, $f14 \n\t"
- "paddh $f16, $f16, $f8 \n\t"
- "paddh $f18, $f18, $f10 \n\t"
- "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
- "paddh $f16, $f16, $f28 \n\t"
- "paddh $f18, $f18, $f30 \n\t"
- "gslqc1 $f10, $f8, 640-544(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
- "paddh $f12, $f12, $f28 \n\t"
- "paddh $f14, $f14, $f30 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "psrah $f16, $f16, $f28 \n\t"
- "psrah $f18, $f18, $f28 \n\t"
- "pandn $f8, $f8, $f16 \n\t"
- "pandn $f10, $f10, $f18 \n\t"
- "or $f24, $f24, $f8 \n\t"
- "or $f26, $f26, $f10 \n\t"
- "and $f28, $f4, $f24 \n\t"
- "and $f30, $f6, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-496(%[tmp]) \n\t"
- "pandn $f8, $f4, $f24 \n\t"
- "pandn $f10, $f6, $f26 \n\t"
- "or $f28, $f28, $f8 \n\t"
- "or $f30, $f30, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-352(%[tmp]) \n\t"
- "packushb $f8, $f8, $f10 \n\t"
- "packushb $f10, $f28, $f30 \n\t"
- "gssqc1 $f10, $f8, 688-272(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-128(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-288(%[tmp]) \n\t"
- "or $f8, $f8, $f28 \n\t"
- "or $f10, $f10, $f30 \n\t"
- "dli %[iAlpha], 0x1 \n\t"
-
- "and $f16, $f0, $f8 \n\t"
- "and $f18, $f2, $f10 \n\t"
- "paddh $f20, $f20, $f24 \n\t"
- "paddh $f22, $f22, $f26 \n\t"
- "gslqc1 $f30, $f28, 640-400(%[tmp]) \n\t"
- "pandn $f8, $f0, $f28 \n\t"
- "pandn $f10, $f2, $f30 \n\t"
- "or $f16, $f16, $f8 \n\t"
- "or $f18, $f18, $f10 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
- "dli %[iAlpha], 0x3 \n\t"
- "psllh $f20, $f20, $f28 \n\t"
- "psllh $f22, $f22, $f28 \n\t"
- "paddh $f20, $f20, $f12 \n\t"
- "paddh $f22, $f22, $f14 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
- "paddh $f8, $f8, $f20 \n\t"
- "paddh $f10, $f10, $f22 \n\t"
- "psrah $f8, $f8, $f28 \n\t"
- "psrah $f10, $f10, $f28 \n\t"
- "gssqc1 $f18, $f16, 640-288(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 640-560(%[tmp]) \n\t"
- "and $f16, $f16, $f8 \n\t"
- "and $f18, $f18, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-464(%[tmp]) \n\t"
- "paddh $f20, $f8, $f8 \n\t"
- "paddh $f22, $f10, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-432(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-448(%[tmp]) \n\t"
- "paddh $f8, $f8, $f28 \n\t"
- "paddh $f10, $f10, $f30 \n\t"
- "dli %[iAlpha], 0x2 \n\t"
- "paddh $f20, $f20, $f8 \n\t"
- "paddh $f22, $f22, $f10 \n\t"
- "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
- "paddh $f20, $f20, $f28 \n\t"
- "paddh $f22, $f22, $f30 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "gslqc1 $f26, $f24, 640-560(%[tmp]) \n\t"
- "psrah $f20, $f20, $f28 \n\t"
- "psrah $f22, $f22, $f28 \n\t"
- "pandn $f12, $f12, $f20 \n\t"
- "pandn $f14, $f14, $f22 \n\t"
- "or $f16, $f16, $f12 \n\t"
- "or $f18, $f18, $f14 \n\t"
- "gslqc1 $f14, $f12, 640-32(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-304(%[tmp]) \n\t"
- "or $f12, $f12, $f28 \n\t"
- "or $f14, $f14, $f30 \n\t"
- "and $f28, $f4, $f16 \n\t"
- "and $f30, $f6, $f18 \n\t"
- "gslqc1 $f18, $f16, 640-432(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 640-464(%[tmp]) \n\t"
- "pandn $f8, $f4, $f16 \n\t"
- "pandn $f10, $f6, $f18 \n\t"
- "or $f28, $f28, $f8 \n\t"
- "or $f30, $f30, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-496(%[tmp]) \n\t"
- "paddh $f16, $f16, $f8 \n\t"
- "paddh $f18, $f18, $f10 \n\t"
- "gslqc1 $f10, $f8, 640-288(%[tmp]) \n\t"
- "packushb $f8, $f8, $f10 \n\t"
- "packushb $f10, $f28, $f30 \n\t"
- "dli %[iAlpha], 0x2 \n\t"
- "gssqc1 $f10, $f8, 704-272(%[tmp]) \n\t"
-
- "and $f8, $f0, $f12 \n\t"
- "and $f10, $f2, $f14 \n\t"
- "gslqc1 $f30, $f28, 640-384(%[tmp]) \n\t"
- "pandn $f12, $f0, $f28 \n\t"
- "pandn $f14, $f2, $f30 \n\t"
- "or $f8, $f8, $f12 \n\t"
- "or $f10, $f10, $f14 \n\t"
- "gssqc1 $f10, $f8, 640-304(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 640-528(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 640-464(%[tmp]) \n\t"
- "paddh $f12, $f8, $f28 \n\t"
- "paddh $f14, $f10, $f30 \n\t"
- "paddh $f12, $f12, $f16 \n\t"
- "paddh $f14, $f14, $f18 \n\t"
- "gslqc1 $f30, $f28, 640-624(%[tmp]) \n\t"
- "paddh $f12, $f12, $f28 \n\t"
- "paddh $f14, $f14, $f30 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "psrah $f12, $f12, $f28 \n\t"
- "psrah $f14, $f14, $f28 \n\t"
- "and $f24, $f24, $f12 \n\t"
- "and $f26, $f26, $f14 \n\t"
- "gslqc1 $f14, $f12, 640-560(%[tmp]) \n\t"
- "pandn $f16, $f12, $f20 \n\t"
- "pandn $f18, $f14, $f22 \n\t"
- "or $f24, $f24, $f16 \n\t"
- "or $f26, $f26, $f18 \n\t"
- "and $f28, $f4, $f24 \n\t"
- "and $f30, $f6, $f26 \n\t"
- "gslqc1 $f26, $f24, 640-304(%[tmp]) \n\t"
- "pandn $f16, $f4, $f20 \n\t"
- "pandn $f18, $f6, $f22 \n\t"
- "or $f28, $f28, $f16 \n\t"
- "or $f30, $f30, $f18 \n\t"
- "dli %[iAlpha], 0x1 \n\t"
-
- "packushb $f24, $f24, $f26 \n\t"
- "packushb $f26, $f28, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-112(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 640-80(%[tmp]) \n\t"
- "or $f28, $f28, $f16 \n\t"
- "or $f30, $f30, $f18 \n\t"
- "and $f16, $f0, $f28 \n\t"
- "and $f18, $f2, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-416(%[tmp]) \n\t"
- "pandn $f0, $f0, $f28 \n\t"
- "pandn $f2, $f2, $f30 \n\t"
- "or $f16, $f16, $f0 \n\t"
- "or $f18, $f18, $f2 \n\t"
- "xor $f28, $f28, $f28 \n\t"
- "xor $f30, $f30, $f30 \n\t"
- "gslqc1 $f2, $f0, 0x0($12) \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "punpcklbh $f0, $f2, $f30 \n\t"
- "punpckhbh $f2, $f2, $f30 \n\t"
- "psllh $f0, $f0, $f28 \n\t"
- "psllh $f2, $f2, $f28 \n\t"
- "paddh $f0, $f0, $f8 \n\t"
- "paddh $f2, $f2, $f10 \n\t"
- "paddh $f0, $f0, $f8 \n\t"
- "paddh $f2, $f2, $f10 \n\t"
- "paddh $f0, $f0, $f8 \n\t"
- "paddh $f2, $f2, $f10 \n\t"
- "paddh $f0, $f0, $f20 \n\t"
- "paddh $f2, $f2, $f22 \n\t"
- "dli %[iAlpha], 0x3 \n\t"
- "gslqc1 $f30, $f28, 640-432(%[tmp]) \n\t"
- "paddh $f0, $f0, $f28 \n\t"
- "paddh $f2, $f2, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-496(%[tmp]) \n\t"
- "paddh $f0, $f0, $f28 \n\t"
- "paddh $f2, $f2, $f30 \n\t"
- "gslqc1 $f30, $f28, 640-592(%[tmp]) \n\t"
- "paddh $f0, $f0, $f28 \n\t"
- "paddh $f2, $f2, $f30 \n\t"
- "dmtc1 %[iAlpha], $f28 \n\t"
- "psrah $f0, $f0, $f28 \n\t"
- "psrah $f2, $f2, $f28 \n\t"
- "and $f0, $f0, $f12 \n\t"
- "and $f2, $f2, $f14 \n\t"
- "pandn $f12, $f12, $f8 \n\t"
- "pandn $f14, $f14, $f10 \n\t"
- "or $f0, $f0, $f12 \n\t"
- "or $f2, $f2, $f14 \n\t"
- "and $f28, $f4, $f0 \n\t"
- "and $f30, $f6, $f2 \n\t"
-
- "gslqc1 $f2, $f0, 656-272(%[tmp]) \n\t"
- "gssqc1 $f2, $f0, 0x0($11) \n\t"
-
- "gslqc1 $f2, $f0, 672-272(%[tmp]) \n\t"
-
- "gssqc1 $f2, $f0, 0x0($8) \n\t"
- "gslqc1 $f2, $f0, 688-272(%[tmp]) \n\t"
- "gssqc1 $f2, $f0, 0x0($9) \n\t"
- "gslqc1 $f2, $f0, 704-272(%[tmp]) \n\t"
-
- "pandn $f4, $f4, $f8 \n\t"
- "pandn $f6, $f6, $f10 \n\t"
- "gssqc1 $f2, $f0, 0x0(%[pPix]) \n\t"
- "or $f28, $f28, $f4 \n\t"
- "or $f30, $f30, $f6 \n\t"
- "packushb $f16, $f16, $f18 \n\t"
- "packushb $f18, $f28, $f30 \n\t"
- "gssqc1 $f26, $f24, 0x0($13) \n\t"
- "gssqc1 $f18, $f16, 0x0(%[iStride]) \n\t"
- : [pPix]"+&r"((unsigned char *)pPix)
- : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
- [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
- : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
- "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
- "$f22", "$f24", "$f26", "$f28", "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
- int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
- unsigned char tmp[256] __attribute__((aligned(32)));
- BACKUP_REG;
- __asm__ volatile (
- ".set arch=loongson3a \n\t"
- "lb $8, 0x2(%[pTC]) \n\t"
- "lb $9, 0x3(%[pTC]) \n\t"
- "move $11, $8 \n\t"
- "lb $8, 0x1(%[pTC]) \n\t"
- "lb %[pTC], 0x0(%[pTC]) \n\t"
- "move $12, %[pTC] \n\t"
- "and %[pTC], $9, 0xFFFF \n\t"
- "dmtc1 %[pTC], $f4 \n\t"
- "and %[pTC], $9, 0xFFFF \n\t"
- "dmtc1 %[pTC], $f8 \n\t"
- "move %[pTC], $11 \n\t"
- "and $9, %[pTC], 0xFFFF \n\t"
- "and %[pTC], %[pTC], 0xFFFF \n\t"
- "dmtc1 %[pTC], $f16 \n\t"
- "and %[pTC], $8, 0xFFFF \n\t"
- "dmtc1 %[pTC], $f20 \n\t"
- "dmtc1 $9, $f12 \n\t"
- "and %[pTC], $8, 0xFFFF \n\t"
- "dmtc1 %[pTC], $f24 \n\t"
- "move %[pTC], $12 \n\t"
- "and $9, %[pTC], 0xFFFF \n\t"
- "and %[pTC], %[pTC], 0xFFFF \n\t"
- "punpcklhw $f24, $f24, $f8 \n\t"
- "xor $f0, $f0, $f0 \n\t"
- "xor $f2, $f2, $f2 \n\t"
- "gssqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
- "dmtc1 $9, $f28 \n\t"
- "dmtc1 %[pTC], $f0 \n\t"
- "daddu %[pTC], %[iStride], %[iStride] \n\t"
- "dsubu $9, %[pPixCb], %[pTC] \n\t"
- "punpcklhw $f20, $f20, $f4 \n\t"
- "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
- "punpcklhw $f0, $f0, $f16 \n\t"
- "gsldxc1 $f16, 0x0(%[iStride], %[pPixCr]) \n\t"
- "punpcklhw $f28, $f28, $f12 \n\t"
- "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
- "punpcklhw $f0, $f0, $f24 \n\t"
- "gsldxc1 $f24, 0x0($9, $0) \n\t"
- "punpcklhw $f28, $f28, $f20 \n\t"
- "punpckhhw $f2, $f0, $f28 \n\t"
- "punpcklhw $f0, $f0, $f28 \n\t"
- "dsubu $9, %[pPixCr], %[pTC] \n\t"
- "psubh $f8, $f4, $f0 \n\t"
- "psubh $f10, $f6, $f2 \n\t"
- "gssqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
- "gsldxc1 $f8, 0x0($9, $0) \n\t"
- "mov.d $f26, $f8 \n\t"
- "dsubu %[pTC], %[pPixCb], %[iStride] \n\t"
- "gsldxc1 $f28, 0x0(%[pTC], $0) \n\t"
- "dsubu $9, %[pPixCr], %[iStride] \n\t"
- "gsldxc1 $f8, 0x0($9, $0) \n\t"
- "mov.d $f30, $f8 \n\t"
- "gsldxc1 $f8, 0x0(%[pPixCr], $0) \n\t"
- "mov.d $f14, $f8 \n\t"
- "gsldxc1 $f8, 0x0(%[iStride], %[pPixCb]) \n\t"
- "mov.d $f10, $f16 \n\t"
- "gssqc1 $f10, $f8, 0xE0(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f8 \n\t"
- "punpcklhw $f16, $f8, $f8 \n\t"
- "dmtc1 %[iBeta], $f8 \n\t"
- "punpcklhw $f20, $f8, $f8 \n\t"
- "punpcklwd $f8, $f20, $f20 \n\t"
- "mov.d $f10, $f8 \n\t"
- "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
- "punpckhbh $f10, $f24, $f4 \n\t"
- "punpcklbh $f8, $f24, $f4 \n\t"
- "gssqc1 $f14, $f12, 0xd0(%[tmp]) \n\t"
- "punpcklwd $f16, $f16, $f16 \n\t"
- "mov.d $f18, $f16 \n\t"
- "gssqc1 $f10, $f8, 0x30(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
- "gssqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0xd0(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
- "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0xe0(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
- "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 0xe0(%[tmp]) \n\t"
- "mov.d $f8, $f28 \n\t"
- "mov.d $f10, $f30 \n\t"
- "punpcklbh $f28, $f30, $f6 \n\t"
- "punpckhbh $f30, $f30, $f6 \n\t"
- "punpckhbh $f22, $f20, $f4 \n\t"
- "punpcklbh $f20, $f20, $f4 \n\t"
- "gssqc1 $f30, $f28, 0xa0(%[tmp]) \n\t"
- "punpckhbh $f14, $f12, $f4 \n\t"
- "punpcklbh $f12, $f12, $f4 \n\t"
- "dli %[iBeta], 0x4 \n\t"
- "punpckhbh $f10, $f8, $f4 \n\t"
- "punpcklbh $f8, $f8, $f4 \n\t"
- "dmtc1 %[iBeta], $f24 \n\t"
- "punpcklhw $f28, $f24, $f24 \n\t"
- "punpcklwd $f24, $f28, $f28 \n\t"
- "mov.d $f26, $f24 \n\t"
- "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
- "psubh $f28, $f28, $f20 \n\t"
- "psubh $f30, $f30, $f22 \n\t"
- "pcmpgth $f24, $f0, $f4 \n\t"
- "pcmpgth $f26, $f2, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
- "psubh $f24, $f12, $f8 \n\t"
- "psubh $f26, $f14, $f10 \n\t"
- "dmfc1 %[iAlpha], $f12 \n\t"
- "dmfc1 %[iBeta], $f14 \n\t"
- "dli $10, 0x2 \n\t"
- "dmtc1 $10, $f12 \n\t"
- "dli $10, 0x3 \n\t"
- "dmtc1 $10, $f14 \n\t"
- "psllh $f24, $f24, $f12 \n\t"
- "psllh $f26, $f26, $f12 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x20(%[tmp]) \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "psrah $f24, $f24, $f14 \n\t"
- "psrah $f26, $f26, $f14 \n\t"
- "dmtc1 %[iAlpha], $f12 \n\t"
- "dmtc1 %[iBeta], $f14 \n\t"
- "pmaxsh $f4, $f4, $f24 \n\t"
- "pmaxsh $f6, $f6, $f26 \n\t"
- "gssqc1 $f2, $f0, 0x10(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
- "pminsh $f24, $f24, $f4 \n\t"
- "pminsh $f26, $f26, $f6 \n\t"
- "gssqc1 $f26, $f24, 0x10(%[tmp]) \n\t"
- "psubh $f4, $f8, $f12 \n\t"
- "psubh $f6, $f10, $f14 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
- "pcmpgth $f24, $f16, $f4 \n\t"
- "pcmpgth $f26, $f18, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
- "psubh $f4, $f4, $f8 \n\t"
- "psubh $f6, $f6, $f10 \n\t"
- "dmfc1 %[iAlpha], $f8 \n\t"
- "dmfc1 %[iBeta], $f10 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
- "pcmpgth $f28, $f28, $f4 \n\t"
- "pcmpgth $f30, $f30, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
- "and $f24, $f24, $f28 \n\t"
- "and $f26, $f26, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "psubh $f20, $f20, $f12 \n\t"
- "psubh $f22, $f22, $f14 \n\t"
- WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
- "pcmpgth $f4, $f4, $f20 \n\t"
- "pcmpgth $f6, $f6, $f22 \n\t"
- "gslqc1 $f22, $f20, 0x80(%[tmp]) \n\t"
- "gslqc1 $f10, $f8, 0x90(%[tmp]) \n\t"
- "psubh $f20, $f20, $f8 \n\t"
- "psubh $f22, $f22, $f10 \n\t"
- "and $f24, $f24, $f4 \n\t"
- "and $f26, $f26, $f6 \n\t"
- "gslqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
- "and $f24, $f24, $f8 \n\t"
- "and $f26, $f26, $f10 \n\t"
- "gslqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
- "and $f4, $f4, $f24 \n\t"
- "and $f6, $f6, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
- "gssqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
- "gslqc1 $f6, $f4, 0xa0(%[tmp]) \n\t"
- "psubh $f24, $f24, $f4 \n\t"
- "psubh $f26, $f26, $f6 \n\t"
- "dli $10, 0x2 \n\t"
- "dmtc1 $10, $f8 \n\t"
- "psllh $f24, $f24, $f8 \n\t"
- "psllh $f26, $f26, $f8 \n\t"
- "paddh $f24, $f24, $f20 \n\t"
- "paddh $f26, $f26, $f22 \n\t"
- "dli $10, 0x3 \n\t"
- "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
- "paddh $f24, $f24, $f8 \n\t"
- "paddh $f26, $f26, $f10 \n\t"
- "dmtc1 $10, $f8 \n\t"
- "gslqc1 $f22, $f20, 0x60(%[tmp]) \n\t"
- "psrah $f24, $f24, $f8 \n\t"
- "psrah $f26, $f26, $f8 \n\t"
- "pmaxsh $f20, $f20, $f24 \n\t"
- "pmaxsh $f22, $f22, $f26 \n\t"
- "pminsh $f0, $f0, $f20 \n\t"
- "pminsh $f2, $f2, $f22 \n\t"
- "gslqc1 $f22, $f20, 0x70(%[tmp]) \n\t"
- "psubh $f24, $f4, $f20 \n\t"
- "psubh $f26, $f6, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
- "pcmpgth $f16, $f16, $f24 \n\t"
- "pcmpgth $f18, $f18, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
- "psubh $f24, $f24, $f4 \n\t"
- "psubh $f26, $f26, $f6 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
- "pcmpgth $f28, $f28, $f24 \n\t"
- "pcmpgth $f30, $f30, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
- "and $f16, $f16, $f28 \n\t"
- "and $f18, $f18, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "psubh $f24, $f24, $f20 \n\t"
- "psubh $f26, $f26, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
- "dmtc1 %[iAlpha], $f8 \n\t"
- "dmtc1 %[iBeta], $f10 \n\t"
- "pcmpgth $f28, $f28, $f24 \n\t"
- "pcmpgth $f30, $f30, $f26 \n\t"
- "and $f16, $f16, $f28 \n\t"
- "and $f18, $f18, $f30 \n\t"
- "gslqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
- "and $f16, $f16, $f24 \n\t"
- "and $f18, $f18, $f26 \n\t"
- "and $f0, $f0, $f16 \n\t"
- "and $f2, $f2, $f18 \n\t"
- "gslqc1 $f18, $f16, 0x30(%[tmp]) \n\t"
- "paddh $f8, $f8, $f16 \n\t"
- "paddh $f10, $f10, $f18 \n\t"
- "paddh $f4, $f4, $f0 \n\t"
- "paddh $f6, $f6, $f2 \n\t"
- "packushb $f8, $f8, $f10 \n\t"
- "packushb $f10, $f4, $f6 \n\t"
- "gssdxc1 $f8, 0x0(%[pTC], $0) \n\t"
- "psubh $f12, $f12, $f16 \n\t"
- "psubh $f14, $f14, $f18 \n\t"
- "psubh $f20, $f20, $f0 \n\t"
- "psubh $f22, $f22, $f2 \n\t"
- "packushb $f12, $f12, $f14 \n\t"
- "packushb $f14, $f20, $f22 \n\t"
- "gssdxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
- "gssdxc1 $f10, 0x0($9, $0) \n\t"
- "gssdxc1 $f14, 0x0(%[pPixCr], $0) \n\t"
- : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
- : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
- [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
- : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
- "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
- "$f28", "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
- int32_t iAlpha, int32_t iBeta) {
- unsigned char tmp[128] __attribute__((aligned(32)));
- BACKUP_REG;
- __asm__ volatile (
- ".set arch=loongson3a \n\t"
- "daddu $8, %[iStride], %[iStride] \n\t"
- "dsubu $9, %[pPixCb], $8 \n\t"
- "gsldxc1 $f16, 0x0(%[pPixCr], $0) \n\t"
- "gsldxc1 $f20, 0x0(%[iStride], %[pPixCr]) \n\t"
- "gsldxc1 $f4, 0x0($9, $0) \n\t"
- "dsubu $9, %[pPixCr], $8 \n\t"
- "gsldxc1 $f8, 0x0($9, $0) \n\t"
- "mov.d $f6, $f8 \n\t"
- "dsubu $8, %[pPixCb], %[iStride] \n\t"
- "gsldxc1 $f8, 0x0($8, $0) \n\t"
- "dsubu $9, %[pPixCr], %[iStride] \n\t"
- "gsldxc1 $f12, 0x0($9, $0) \n\t"
- "mov.d $f10, $f12 \n\t"
- "gsldxc1 $f12, 0x0(%[pPixCb], $0) \n\t"
- "mov.d $f14, $f16 \n\t"
- "gsldxc1 $f16, 0x0(%[iStride], %[pPixCb]) \n\t"
- "mov.d $f18, $f20 \n\t"
- "dmtc1 %[iAlpha], $f20 \n\t"
- "xor $f0, $f0, $f0 \n\t"
- "xor $f2, $f2, $f2 \n\t"
- "punpcklhw $f24, $f20, $f20 \n\t"
- "punpcklwd $f20, $f24, $f24 \n\t"
- "mov.d $f22, $f20 \n\t"
- "dmtc1 %[iBeta], $f24 \n\t"
- "punpcklhw $f28, $f24, $f24 \n\t"
- "punpcklwd $f24, $f28, $f28 \n\t"
- "mov.d $f26, $f24 \n\t"
- "mov.d $f28, $f4 \n\t"
- "punpcklbh $f4, $f6, $f2 \n\t"
- "punpckhbh $f6, $f6, $f2 \n\t"
- "punpckhbh $f30, $f28, $f0 \n\t"
- "punpcklbh $f28, $f28, $f0 \n\t"
- "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
- "gssqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
- "punpckhbh $f30, $f8, $f0 \n\t"
- "punpcklbh $f28, $f8, $f0 \n\t"
- "gssqc1 $f30, $f28, 0x10(%[tmp]) \n\t"
- "punpckhbh $f30, $f12, $f0 \n\t"
- "punpcklbh $f28, $f12, $f0 \n\t"
- "punpcklbh $f12, $f14, $f2 \n\t"
- "punpckhbh $f14, $f14, $f2 \n\t"
- "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "mov.d $f28, $f16 \n\t"
- "punpcklbh $f16, $f18, $f2 \n\t"
- "punpckhbh $f18, $f18, $f2 \n\t"
- "punpcklbh $f8, $f10, $f2 \n\t"
- "punpckhbh $f10, $f10, $f2 \n\t"
- "punpckhbh $f30, $f28, $f0 \n\t"
- "punpcklbh $f28, $f28, $f0 \n\t"
- "gssqc1 $f14, $f12, 0x30(%[tmp]) \n\t"
- "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 0x50(%[tmp]) \n\t"
- "psubh $f4, $f12, $f0 \n\t"
- "psubh $f6, $f14, $f2 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
- "gssqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
- "pcmpgth $f0, $f20, $f4 \n\t"
- "pcmpgth $f2, $f22, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x60(%[tmp]) \n\t"
- "psubh $f4, $f4, $f12 \n\t"
- "psubh $f6, $f6, $f14 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
- "pcmpgth $f16, $f24, $f4 \n\t"
- "pcmpgth $f18, $f26, $f6 \n\t"
- "and $f0, $f0, $f16 \n\t"
- "and $f2, $f2, $f18 \n\t"
- "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
- "psubh $f4, $f28, $f16 \n\t"
- "psubh $f6, $f30, $f18 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
- "pcmpgth $f16, $f24, $f4 \n\t"
- "pcmpgth $f18, $f26, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x30(%[tmp]) \n\t"
- "psubh $f4, $f8, $f4 \n\t"
- "psubh $f6, $f10, $f6 \n\t"
- "dmfc1 %[iAlpha], $f28 \n\t"
- "dmfc1 %[iBeta], $f30 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
- "pcmpgth $f20, $f20, $f4 \n\t"
- "pcmpgth $f22, $f22, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
- "and $f0, $f0, $f16 \n\t"
- "and $f2, $f2, $f18 \n\t"
- "psubh $f4, $f4, $f8 \n\t"
- "psubh $f6, $f6, $f10 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
- "pcmpgth $f16, $f24, $f4 \n\t"
- "pcmpgth $f18, $f26, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
- "psubh $f4, $f4, $f28 \n\t"
- "psubh $f6, $f6, $f30 \n\t"
- "and $f20, $f20, $f16 \n\t"
- "and $f22, $f22, $f18 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
- "dmtc1 %[iAlpha], $f28 \n\t"
- "dmtc1 %[iBeta], $f30 \n\t"
- "pcmpgth $f24, $f24, $f4 \n\t"
- "pcmpgth $f26, $f26, $f6 \n\t"
- "and $f20, $f20, $f24 \n\t"
- "and $f22, $f22, $f26 \n\t"
- "dli %[iBeta], 0x2 \n\t"
- "dmtc1 %[iBeta], $f4 \n\t"
- "punpcklhw $f16, $f4, $f4 \n\t"
- "punpcklwd $f4, $f16, $f16 \n\t"
- "mov.d $f6, $f4 \n\t"
- "gslqc1 $f18, $f16, 0x60(%[tmp]) \n\t"
- "paddh $f24, $f16, $f16 \n\t"
- "paddh $f26, $f18, $f18 \n\t"
- "paddh $f24, $f24, $f12 \n\t"
- "paddh $f26, $f26, $f14 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "gssqc1 $f6, $f4, 0x10(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 0x10(%[tmp]) \n\t"
- "paddh $f24, $f24, $f16 \n\t"
- "paddh $f26, $f26, $f18 \n\t"
- "dmtc1 %[iBeta], $f16 \n\t"
- "psrah $f24, $f24, $f16 \n\t"
- "psrah $f26, $f26, $f16 \n\t"
- "pandn $f16, $f0, $f12 \n\t"
- "pandn $f18, $f2, $f14 \n\t"
- "gslqc1 $f14, $f12, 0x40(%[tmp]) \n\t"
- "and $f4, $f0, $f24 \n\t"
- "and $f6, $f2, $f26 \n\t"
- "or $f4, $f4, $f16 \n\t"
- "or $f6, $f6, $f18 \n\t"
- "paddh $f24, $f12, $f12 \n\t"
- "paddh $f26, $f14, $f14 \n\t"
- "gslqc1 $f14, $f12, 0x10(%[tmp]) \n\t"
- "paddh $f24, $f24, $f8 \n\t"
- "paddh $f26, $f26, $f10 \n\t"
- "gslqc1 $f18, $f16, 0x20(%[tmp]) \n\t"
- "paddh $f24, $f24, $f16 \n\t"
- "paddh $f26, $f26, $f18 \n\t"
- "dmtc1 %[iBeta], $f16 \n\t"
- "paddh $f24, $f24, $f12 \n\t"
- "paddh $f26, $f26, $f14 \n\t"
- "psrah $f24, $f24, $f16 \n\t"
- "psrah $f26, $f26, $f16 \n\t"
- "and $f16, $f20, $f24 \n\t"
- "and $f18, $f22, $f26 \n\t"
- "pandn $f24, $f20, $f8 \n\t"
- "pandn $f26, $f22, $f10 \n\t"
- "or $f16, $f16, $f24 \n\t"
- "or $f18, $f18, $f26 \n\t"
- "packushb $f4, $f4, $f6 \n\t"
- "packushb $f6, $f16, $f18 \n\t"
- "gslqc1 $f18, $f16, 0x50(%[tmp]) \n\t"
- "paddh $f24, $f28, $f28 \n\t"
- "paddh $f26, $f30, $f30 \n\t"
- "paddh $f24, $f24, $f16 \n\t"
- "paddh $f26, $f26, $f18 \n\t"
- "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
- "paddh $f24, $f24, $f8 \n\t"
- "paddh $f26, $f26, $f10 \n\t"
- "dmtc1 %[iBeta], $f28 \n\t"
- "paddh $f24, $f24, $f12 \n\t"
- "paddh $f26, $f26, $f14 \n\t"
- "psrah $f24, $f24, $f28 \n\t"
- "psrah $f26, $f26, $f28 \n\t"
- "and $f8, $f0, $f24 \n\t"
- "and $f10, $f2, $f26 \n\t"
- "pandn $f0, $f0, $f16 \n\t"
- "pandn $f2, $f2, $f18 \n\t"
- "or $f8, $f8, $f0 \n\t"
- "or $f10, $f10, $f2 \n\t"
- "gslqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
- "paddh $f24, $f0, $f0 \n\t"
- "paddh $f26, $f2, $f2 \n\t"
- "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
- "paddh $f24, $f24, $f0 \n\t"
- "paddh $f26, $f26, $f2 \n\t"
- "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
- "paddh $f24, $f24, $f16 \n\t"
- "paddh $f26, $f26, $f18 \n\t"
- "paddh $f24, $f24, $f12 \n\t"
- "paddh $f26, $f26, $f14 \n\t"
- "gssdxc1 $f4, 0x0($8, $0) \n\t"
- "psrah $f24, $f24, $f28 \n\t"
- "psrah $f26, $f26, $f28 \n\t"
- "and $f16, $f20, $f24 \n\t"
- "and $f18, $f22, $f26 \n\t"
- "pandn $f20, $f20, $f0 \n\t"
- "pandn $f22, $f22, $f2 \n\t"
- "or $f16, $f16, $f20 \n\t"
- "or $f18, $f18, $f22 \n\t"
- "packushb $f8, $f8, $f10 \n\t"
- "packushb $f10, $f16, $f18 \n\t"
- "gssdxc1 $f8, 0x0(%[pPixCb], $0) \n\t"
- "gssdxc1 $f6, 0x0($9, $0) \n\t"
- "gssdxc1 $f10, 0x0(%[pPixCr], $0) \n\t"
- : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
- : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
- [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
- : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
- "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
- "$f28", "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
- int32_t iAlpha, int32_t iBeta) {
- unsigned char tmp[256] __attribute__((aligned(32)));
- BACKUP_REG;
- __asm__ volatile (
- ".set arch=loongson3a \n\t"
- "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
- "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
- "move $9, %[pPixCb] \n\t"
- "move $10, %[pPixCr] \n\t"
- "dsll $11, %[iStride], 0x2 \n\t"
- "daddu %[pPixCb], %[pPixCb], $11 \n\t"
- "daddu %[pPixCr], %[pPixCr], $11 \n\t"
- "daddiu $11, %[tmp], 0x80 \n\t"
- "gsldlc1 $f0, 0x7($9) \n\t"
- "gsldrc1 $f0, 0x0($9) \n\t"
- "daddu $12, $9, %[iStride] \n\t"
- "gsldlc1 $f4, 0x7($12) \n\t"
- "gsldrc1 $f4, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsldlc1 $f8, 0x7($12) \n\t"
- "gsldrc1 $f8, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsldlc1 $f12, 0x7($12) \n\t"
- "gsldlc1 $f16, 0x7($10) \n\t"
- "gsldrc1 $f12, 0x0($12) \n\t"
- "gsldrc1 $f16, 0x0($10) \n\t"
- "daddu $12, $10, %[iStride] \n\t"
- "gsldlc1 $f20, 0x7($12) \n\t"
- "gsldrc1 $f20, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsldlc1 $f24, 0x7($12) \n\t"
- "gsldrc1 $f24, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsldlc1 $f28, 0x7($12) \n\t"
- "gsldrc1 $f28, 0x0($12) \n\t"
- "punpcklwd $f0, $f0, $f16 \n\t"
- "punpcklwd $f4, $f4, $f20 \n\t"
- "punpcklwd $f8, $f8, $f24 \n\t"
- "punpcklwd $f12, $f12, $f28 \n\t"
- "gsldlc1 $f16, 0x7(%[pPixCb]) \n\t"
- "gsldlc1 $f20, 0x7(%[pPixCr]) \n\t"
- "gsldrc1 $f16, 0x0(%[pPixCb]) \n\t"
- "gsldrc1 $f20, 0x0(%[pPixCr]) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f2, $f16 \n\t"
- "daddu $12, %[pPixCb], %[iStride] \n\t"
- "daddu $13, %[pPixCr], %[iStride] \n\t"
- "gsldlc1 $f16, 0x7($12) \n\t"
- "gsldlc1 $f20, 0x7($13) \n\t"
- "gsldrc1 $f16, 0x0($12) \n\t"
- "gsldrc1 $f20, 0x0($13) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f6, $f16 \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "daddu $13, $13, %[iStride] \n\t"
- "gsldlc1 $f16, 0x7($12) \n\t"
- "gsldlc1 $f20, 0x7($13) \n\t"
- "gsldrc1 $f16, 0x0($12) \n\t"
- "gsldrc1 $f20, 0x0($13) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f10, $f16 \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "daddu $13, $13, %[iStride] \n\t"
- "gsldlc1 $f16, 0x7($12) \n\t"
- "gsldlc1 $f20, 0x7($13) \n\t"
- "gsldrc1 $f16, 0x0($12) \n\t"
- "gsldrc1 $f20, 0x0($13) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f14, $f16 \n\t"
- "punpcklbh $f24, $f2, $f6 \n\t"
- "punpckhbh $f26, $f2, $f6 \n\t"
- "punpckhbh $f2, $f0, $f4 \n\t"
- "punpcklbh $f0, $f0, $f4 \n\t"
- "punpcklbh $f28, $f10, $f14 \n\t"
- "punpckhbh $f30, $f10, $f14 \n\t"
- "punpckhbh $f10, $f8, $f12 \n\t"
- "punpcklbh $f8, $f8, $f12 \n\t"
- "punpcklhw $f16, $f2, $f10 \n\t"
- "punpckhhw $f18, $f2, $f10 \n\t"
- "punpckhhw $f2, $f0, $f8 \n\t"
- "punpcklhw $f0, $f0, $f8 \n\t"
- "punpcklhw $f20, $f26, $f30 \n\t"
- "punpckhhw $f22, $f26, $f30 \n\t"
- "punpckhhw $f26, $f24, $f28 \n\t"
- "punpcklhw $f24, $f24, $f28 \n\t"
- "punpcklwd $f4, $f2, $f26 \n\t"
- "punpckhwd $f6, $f2, $f26 \n\t"
- "punpckhwd $f2, $f0, $f24 \n\t"
- "punpcklwd $f0, $f0, $f24 \n\t"
- "punpcklwd $f8, $f18, $f22 \n\t"
- "punpckhwd $f10, $f18, $f22 \n\t"
- "punpckhwd $f18, $f16, $f20 \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f20, $f2 \n\t"
- "mov.d $f22, $f18 \n\t"
- "mov.d $f2, $f16 \n\t"
- "mov.d $f24, $f6 \n\t"
- "mov.d $f26, $f10 \n\t"
- "mov.d $f6, $f8 \n\t"
- "gssqc1 $f2, $f0, 0x0($11) \n\t"
- "gssqc1 $f22, $f20, 0x10($11) \n\t"
- "gssqc1 $f6, $f4, 0x20($11) \n\t"
- "gssqc1 $f26, $f24, 0x30($11) \n\t"
- "gslqc1 $f26, $f24, 0x80(%[tmp]) \n\t"
- "gslqc1 $f18, $f16, 0x90(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 0xa0(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 0xb0(%[tmp]) \n\t"
- "xor $f0, $f0, $f0 \n\t"
- "dmtc1 %[iAlpha], $f4 \n\t"
- "punpcklhw $f8, $f4, $f4 \n\t"
- "punpcklwd $f4, $f8, $f8 \n\t"
- "mov.d $f6, $f4 \n\t"
- "dmtc1 %[iBeta], $f8 \n\t"
- "punpcklhw $f12, $f8, $f8 \n\t"
- "punpcklwd $f8, $f12, $f12 \n\t"
- "mov.d $f10, $f8 \n\t"
- "mov.d $f12, $f24 \n\t"
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0xa0(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "gssqc1 $f26, $f24, 0x40(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0xb0(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f0 \n\t"
- "punpckhbh $f26, $f26, $f0 \n\t"
- "gssqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
- "punpckhbh $f30, $f28, $f0 \n\t"
- "punpcklbh $f28, $f28, $f0 \n\t"
- "punpckhbh $f18, $f16, $f0 \n\t"
- "punpcklbh $f16, $f16, $f0 \n\t"
- "punpckhbh $f22, $f20, $f0 \n\t"
- "punpcklbh $f20, $f20, $f0 \n\t"
- "punpckhbh $f14, $f12, $f0 \n\t"
- "punpcklbh $f12, $f12, $f0 \n\t"
- "gssqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "psubh $f24, $f16, $f20 \n\t"
- "psubh $f26, $f18, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
- "pcmpgth $f0, $f4, $f24 \n\t"
- "pcmpgth $f2, $f6, $f26 \n\t"
- "psubh $f24, $f12, $f16 \n\t"
- "psubh $f26, $f14, $f18 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
- "pcmpgth $f28, $f8, $f24 \n\t"
- "pcmpgth $f30, $f10, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x50(%[tmp]) \n\t"
- "psubh $f24, $f24, $f20 \n\t"
- "psubh $f26, $f26, $f22 \n\t"
- "and $f0, $f0, $f28 \n\t"
- "and $f2, $f2, $f30 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
- "dmfc1 %[iAlpha], $f20 \n\t"
- "dmfc1 %[iBeta], $f22 \n\t"
- "pcmpgth $f28, $f8, $f24 \n\t"
- "pcmpgth $f30, $f10, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
- "psubh $f24, $f24, $f20 \n\t"
- "psubh $f26, $f26, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
- "pcmpgth $f4, $f4, $f24 \n\t"
- "pcmpgth $f6, $f6, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
- "psubh $f24, $f24, $f20 \n\t"
- "psubh $f26, $f26, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
- "and $f0, $f0, $f28 \n\t"
- "and $f2, $f2, $f30 \n\t"
- "pcmpgth $f28, $f8, $f24 \n\t"
- "pcmpgth $f30, $f10, $f26 \n\t"
- "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
- "gslqc1 $f22, $f20, 0x40(%[tmp]) \n\t"
- "psubh $f24, $f24, $f20 \n\t"
- "psubh $f26, $f26, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
- "dli $8, 0x2 \n\t"
- "and $f4, $f4, $f28 \n\t"
- "and $f6, $f6, $f30 \n\t"
- "pcmpgth $f8, $f8, $f24 \n\t"
- "pcmpgth $f10, $f10, $f26 \n\t"
- "and $f4, $f4, $f8 \n\t"
- "and $f6, $f6, $f10 \n\t"
- "dmtc1 $8, $f8 \n\t"
- "punpcklhw $f24, $f8, $f8 \n\t"
- "punpcklwd $f8, $f24, $f24 \n\t"
- "mov.d $f10, $f8 \n\t"
- "gssqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
- "paddh $f8, $f12, $f12 \n\t"
- "paddh $f10, $f14, $f14 \n\t"
- "paddh $f8, $f8, $f16 \n\t"
- "paddh $f10, $f10, $f18 \n\t"
- "gslqc1 $f22, $f20, 0x50(%[tmp]) \n\t"
- "paddh $f8, $f8, $f20 \n\t"
- "paddh $f10, $f10, $f22 \n\t"
- "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
- "paddh $f8, $f8, $f24 \n\t"
- "paddh $f10, $f10, $f26 \n\t"
- "dmtc1 $8, $f20 \n\t"
- "psrah $f8, $f8, $f20 \n\t"
- "psrah $f10, $f10, $f20 \n\t"
- "and $f24, $f0, $f8 \n\t"
- "and $f26, $f2, $f10 \n\t"
- "pandn $f8, $f0, $f16 \n\t"
- "pandn $f10, $f2, $f18 \n\t"
- "or $f24, $f24, $f8 \n\t"
- "or $f26, $f26, $f10 \n\t"
- "gslqc1 $f10, $f8, 0x60(%[tmp]) \n\t"
- "paddh $f28, $f8, $f8 \n\t"
- "paddh $f30, $f10, $f10 \n\t"
- "gslqc1 $f22, $f20, 0x30(%[tmp]) \n\t"
- "paddh $f28, $f28, $f20 \n\t"
- "paddh $f30, $f30, $f22 \n\t"
- "gslqc1 $f18, $f16, 0x70(%[tmp]) \n\t"
- "paddh $f28, $f28, $f16 \n\t"
- "paddh $f30, $f30, $f18 \n\t"
- "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
- "paddh $f28, $f28, $f8 \n\t"
- "paddh $f30, $f30, $f10 \n\t"
- "pandn $f8, $f4, $f20 \n\t"
- "pandn $f10, $f6, $f22 \n\t"
- "dmtc1 $8, $f20 \n\t"
- "psrah $f28, $f28, $f20 \n\t"
- "psrah $f30, $f30, $f20 \n\t"
- "and $f16, $f4, $f28 \n\t"
- "and $f18, $f6, $f30 \n\t"
- "or $f16, $f16, $f8 \n\t"
- "or $f18, $f18, $f10 \n\t"
- "gslqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
- "packushb $f24, $f24, $f26 \n\t"
- "packushb $f26, $f16, $f18 \n\t"
- "gssqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
- "paddh $f24, $f8, $f8 \n\t"
- "paddh $f26, $f10, $f10 \n\t"
- "dmtc1 %[iAlpha], $f20 \n\t"
- "dmtc1 %[iBeta], $f22 \n\t"
- "gslqc1 $f10, $f8, 0x20(%[tmp]) \n\t"
- "paddh $f24, $f24, $f20 \n\t"
- "paddh $f26, $f26, $f22 \n\t"
- "paddh $f24, $f24, $f12 \n\t"
- "paddh $f26, $f26, $f14 \n\t"
- "mov.d $f16, $f0 \n\t"
- "mov.d $f18, $f2 \n\t"
- "pandn $f0, $f0, $f20 \n\t"
- "pandn $f2, $f2, $f22 \n\t"
- "dmtc1 $8, $f20 \n\t"
- "paddh $f24, $f24, $f8 \n\t"
- "paddh $f26, $f26, $f10 \n\t"
- "psrah $f24, $f24, $f20 \n\t"
- "psrah $f26, $f26, $f20 \n\t"
- "and $f16, $f16, $f24 \n\t"
- "and $f18, $f18, $f26 \n\t"
- "or $f16, $f16, $f0 \n\t"
- "or $f18, $f18, $f2 \n\t"
- "gslqc1 $f2, $f0, 0x70(%[tmp]) \n\t"
- "paddh $f20, $f0, $f0 \n\t"
- "paddh $f22, $f2, $f2 \n\t"
- "gslqc1 $f2, $f0, 0x40(%[tmp]) \n\t"
- "paddh $f20, $f20, $f0 \n\t"
- "paddh $f22, $f22, $f2 \n\t"
- "gslqc1 $f14, $f12, 0x60(%[tmp]) \n\t"
- "paddh $f20, $f20, $f12 \n\t"
- "paddh $f22, $f22, $f14 \n\t"
- "paddh $f20, $f20, $f8 \n\t"
- "paddh $f22, $f22, $f10 \n\t"
- "dmtc1 $8, $f8 \n\t"
- "psrah $f20, $f20, $f8 \n\t"
- "psrah $f22, $f22, $f8 \n\t"
- "and $f12, $f4, $f20 \n\t"
- "and $f14, $f6, $f22 \n\t"
- "pandn $f4, $f4, $f0 \n\t"
- "pandn $f6, $f6, $f2 \n\t"
- "or $f12, $f12, $f4 \n\t"
- "or $f14, $f14, $f6 \n\t"
- "packushb $f16, $f16, $f18 \n\t"
- "packushb $f18, $f12, $f14 \n\t"
- "gssqc1 $f18, $f16, 0xa0(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 0x0($11) \n\t"
- "gslqc1 $f6, $f4, 0x10($11) \n\t"
- "gslqc1 $f10, $f8, 0x20($11) \n\t"
- "gslqc1 $f14, $f12, 0x30($11) \n\t"
- "mov.d $f26, $f2 \n\t"
- "punpckhbh $f2, $f0, $f4 \n\t"
- "punpcklbh $f0, $f0, $f4 \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
- "mov.d $f30, $f10 \n\t"
- "punpckhbh $f10, $f8, $f12 \n\t"
- "punpcklbh $f8, $f8, $f12 \n\t"
- "punpcklbh $f28, $f30, $f14 \n\t"
- "punpckhbh $f30, $f30, $f14 \n\t"
- "punpcklhw $f16, $f2, $f10 \n\t"
- "punpckhhw $f18, $f2, $f10 \n\t"
- "punpcklhw $f20, $f26, $f30 \n\t"
- "punpckhhw $f22, $f26, $f30 \n\t"
- "punpckhhw $f2, $f0, $f8 \n\t"
- "punpcklhw $f0, $f0, $f8 \n\t"
- "punpckhhw $f26, $f24, $f28 \n\t"
- "punpcklhw $f24, $f24, $f28 \n\t"
- "punpcklwd $f4, $f2, $f26 \n\t"
- "punpckhwd $f6, $f2, $f26 \n\t"
- "punpcklwd $f8, $f18, $f22 \n\t"
- "punpckhwd $f10, $f18, $f22 \n\t"
- "punpckhwd $f2, $f0, $f24 \n\t"
- "punpcklwd $f0, $f0, $f24 \n\t"
- "punpckhwd $f18, $f16, $f20 \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f20, $f2 \n\t"
- "mov.d $f24, $f6 \n\t"
- "mov.d $f2, $f16 \n\t"
- "mov.d $f22, $f18 \n\t"
- "mov.d $f6, $f8 \n\t"
- "mov.d $f26, $f10 \n\t"
- "dli %[iAlpha], 0x20 \n\t"
- "dmtc1 %[iAlpha], $f8 \n\t"
- "gsswlc1 $f0, 0x3($9) \n\t"
- "gsswrc1 $f0, 0x0($9) \n\t"
- "daddu $12, $9, %[iStride] \n\t"
- "gsswlc1 $f20, 0x3($12) \n\t"
- "gsswrc1 $f20, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsswlc1 $f4, 0x3($12) \n\t"
- "gsswrc1 $f4, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsswlc1 $f24, 0x3($12) \n\t"
- "gsswrc1 $f24, 0x0($12) \n\t"
- "dsrl $f0, $f0, $f8 \n\t"
- "dsrl $f20, $f20, $f8 \n\t"
- "dsrl $f4, $f4, $f8 \n\t"
- "dsrl $f24, $f24, $f8 \n\t"
- "gsswlc1 $f0, 0x3($10) \n\t"
- "gsswrc1 $f0, 0x0($10) \n\t"
- "daddu $13, $10, %[iStride] \n\t"
- "daddu $8, $13, %[iStride] \n\t"
- "gsswlc1 $f20, 0x3($13) \n\t"
- "gsswrc1 $f20, 0x0($13) \n\t"
- "daddu $13, $8, %[iStride] \n\t"
- "gsswlc1 $f4, 0x3($8) \n\t"
- "gsswrc1 $f4, 0x0($8) \n\t"
- "gsswlc1 $f24, 0x3($13) \n\t"
- "gsswrc1 $f24, 0x0($13) \n\t"
- "gsswlc1 $f2, 0x3(%[pPixCb]) \n\t"
- "gsswrc1 $f2, 0x0(%[pPixCb]) \n\t"
- "daddu $12, %[pPixCb], %[iStride] \n\t"
- "gsswlc1 $f22, 0x3($12) \n\t"
- "gsswrc1 $f22, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsswlc1 $f6, 0x3($12) \n\t"
- "gsswrc1 $f6, 0x0($12) \n\t"
- "daddu $12, $12, %[iStride] \n\t"
- "gsswlc1 $f26, 0x3($12) \n\t"
- "gsswrc1 $f26, 0x0($12) \n\t"
- "dsrl $f2, $f2, $f8 \n\t"
- "dsrl $f22, $f22, $f8 \n\t"
- "dsrl $f6, $f6, $f8 \n\t"
- "dsrl $f26, $f26, $f8 \n\t"
- "gsswlc1 $f2, 0x3(%[pPixCr]) \n\t"
- "gsswrc1 $f2, 0x0(%[pPixCr]) \n\t"
- "daddu $13, %[pPixCr], %[iStride] \n\t"
- "daddu $8, $13, %[iStride] \n\t"
- "gsswlc1 $f22, 0x3($13) \n\t"
- "gsswrc1 $f22, 0x0($13) \n\t"
- "daddu $13, $8, %[iStride] \n\t"
- "gsswlc1 $f6, 0x3($8) \n\t"
- "gsswrc1 $f6, 0x0($8) \n\t"
- "gsswlc1 $f26, 0x3($13) \n\t"
- "gsswrc1 $f26, 0x0($13) \n\t"
- : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
- : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
- [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
- : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
- "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
- "$f24", "$f26", "$f28", "$f30"
- );
- RECOVER_REG;
-}
-
-void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
- int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
- unsigned char tmp[320] __attribute__((aligned(32)));
- BACKUP_REG;
- __asm__ volatile (
- ".set arch=loongson3a \n\t"
- "daddiu %[pPixCb], %[pPixCb], -0x2 \n\t"
- "daddiu %[pPixCr], %[pPixCr], -0x2 \n\t"
- "daddu $8, %[pPixCb], %[iStride] \n\t"
- "gsldlc1 $f0, 0x7(%[pPixCb]) \n\t"
- "gsldlc1 $f4, 0x7($8) \n\t"
- "gsldrc1 $f0, 0x0(%[pPixCb]) \n\t"
- "gsldrc1 $f4, 0x0($8) \n\t"
- "daddu $9, $8, %[iStride] \n\t"
- "daddu $8, $9, %[iStride] \n\t"
- "gsldlc1 $f8, 0x7($9) \n\t"
- "gsldlc1 $f12, 0x7($8) \n\t"
- "gsldrc1 $f8, 0x0($9) \n\t"
- "gsldrc1 $f12, 0x0($8) \n\t"
- "daddu $9, $8, %[iStride] \n\t"
-
- "daddu $10, %[pPixCr], %[iStride] \n\t"
- "gsldlc1 $f16, 0x7(%[pPixCr]) \n\t"
- "gsldlc1 $f20, 0x7($10) \n\t"
- "gsldrc1 $f16, 0x0(%[pPixCr]) \n\t"
- "gsldrc1 $f20, 0x0($10) \n\t"
- "daddu $11, $10, %[iStride] \n\t"
- "daddu $10, $11, %[iStride] \n\t"
- "gsldlc1 $f24, 0x7($11) \n\t"
- "gsldlc1 $f28, 0x7($10) \n\t"
- "gsldrc1 $f24, 0x0($11) \n\t"
- "gsldrc1 $f28, 0x0($10) \n\t"
- "daddu $11, $10, %[iStride] \n\t"
-
- "punpcklwd $f0, $f0, $f16 \n\t"
- "punpcklwd $f4, $f4, $f20 \n\t"
- "punpcklwd $f8, $f8, $f24 \n\t"
- "punpcklwd $f12, $f12, $f28 \n\t"
- "gsldlc1 $f16, 0x7($9) \n\t"
- "gsldlc1 $f20, 0x7($11) \n\t"
- "gsldrc1 $f16, 0x0($9) \n\t"
- "gsldrc1 $f20, 0x0($11) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f2, $f16 \n\t"
- "daddu $8, $9, %[iStride] \n\t"
- "daddu $10, $11, %[iStride] \n\t"
- "gsldlc1 $f16, 0x7($8) \n\t"
- "gsldlc1 $f20, 0x7($10) \n\t"
- "gsldrc1 $f16, 0x0($8) \n\t"
- "gsldrc1 $f20, 0x0($10) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f6, $f16 \n\t"
- "daddu $9, $8, %[iStride] \n\t"
- "daddu $11, $10, %[iStride] \n\t"
-
- "gsldlc1 $f16, 0x7($9) \n\t"
- "gsldlc1 $f20, 0x7($11) \n\t"
- "gsldrc1 $f16, 0x0($9) \n\t"
- "gsldrc1 $f20, 0x0($11) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f10, $f16 \n\t"
- "daddu $8, $9, %[iStride] \n\t"
- "daddu $10, $11, %[iStride] \n\t"
-
- "gsldlc1 $f16, 0x7($8) \n\t"
- "gsldlc1 $f20, 0x7($10) \n\t"
- "gsldrc1 $f16, 0x0($8) \n\t"
- "gsldrc1 $f20, 0x0($10) \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
- "mov.d $f14, $f16 \n\t"
-
- "punpcklbh $f24, $f2, $f6 \n\t"
- "punpckhbh $f26, $f2, $f6 \n\t"
- "punpckhbh $f2, $f0, $f4 \n\t"
- "punpcklbh $f0, $f0, $f4 \n\t"
- "punpcklbh $f28, $f10, $f14 \n\t"
- "punpckhbh $f30, $f10, $f14 \n\t"
- "punpckhbh $f10, $f8, $f12 \n\t"
- "punpcklbh $f8, $f8, $f12 \n\t"
-
- "punpcklhw $f16, $f2, $f10 \n\t"
- "punpckhhw $f18, $f2, $f10 \n\t"
- "punpckhhw $f2, $f0, $f8 \n\t"
- "punpcklhw $f0, $f0, $f8 \n\t"
- "punpcklhw $f20, $f26, $f30 \n\t"
- "punpckhhw $f22, $f26, $f30 \n\t"
- "punpckhhw $f26, $f24, $f28 \n\t"
- "punpcklhw $f24, $f24, $f28 \n\t"
-
- "punpcklwd $f4, $f2, $f26 \n\t"
- "punpckhwd $f6, $f2, $f26 \n\t"
- "punpckhwd $f2, $f0, $f24 \n\t"
- "punpcklwd $f0, $f0, $f24 \n\t"
- "punpcklwd $f8, $f18, $f22 \n\t"
- "punpckhwd $f10, $f18, $f22 \n\t"
- "punpckhwd $f18, $f16, $f20 \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
-
- "mov.d $f20, $f2 \n\t"
- "mov.d $f22, $f18 \n\t"
- "mov.d $f2, $f16 \n\t"
- "mov.d $f24, $f6 \n\t"
- "mov.d $f26, $f10 \n\t"
- "mov.d $f6, $f8 \n\t"
- "daddiu $11, %[tmp], 0x70 \n\t"
-
- "gssqc1 $f2, $f0, 0x0($11) \n\t"
- "gssqc1 $f22, $f20, 0x10($11) \n\t"
- "gssqc1 $f6, $f4, 0x20($11) \n\t"
- "gssqc1 $f26, $f24, 0x30($11) \n\t"
-
- "lb $8, 0x3(%[pTC]) \n\t"
- "lb $9, 0x2(%[pTC]) \n\t"
- "lb $10, 0x1(%[pTC]) \n\t"
- "lb $11, 0x0(%[pTC]) \n\t"
-
- "and $12, $8, 0xFFFF \n\t"
- "dmtc1 $12, $f8 \n\t"
-
- "and $9, $9, 0xFFFF \n\t"
- "dmtc1 $9, $f12 \n\t"
- "mov.d $f16, $f12 \n\t"
-
- "and $9, $10, 0xFFFF \n\t"
- "dmtc1 $9, $f20 \n\t"
- "xor $f0, $f0, $f0 \n\t"
- "mov.d $f24, $f20 \n\t"
- "and $9, $11, 0xFFFF \n\t"
- "punpcklhw $f24, $f24, $f8 \n\t"
-
- "mov.d $f4, $f8 \n\t"
- "dmtc1 $9, $f28 \n\t"
- "mov.d $f0, $f28 \n\t"
-
- "punpcklhw $f28, $f28, $f12 \n\t"
- "punpcklhw $f20, $f20, $f4 \n\t"
- "xor $f4, $f4, $f4 \n\t"
- "xor $f6, $f6, $f6 \n\t"
- "punpcklhw $f28, $f28, $f20 \n\t"
- "gslqc1 $f22, $f20, 0xA0(%[tmp]) \n\t"
- "punpcklhw $f0, $f0, $f16 \n\t"
- "punpcklhw $f0, $f0, $f24 \n\t"
-
- "gslqc1 $f26, $f24, 0x70(%[tmp]) \n\t"
- "punpckhhw $f2, $f0, $f28 \n\t"
- "punpcklhw $f0, $f0, $f28 \n\t"
- "gslqc1 $f30, $f28, 0x80(%[tmp]) \n\t"
- "psubh $f8, $f4, $f0 \n\t"
- "psubh $f10, $f6, $f2 \n\t"
- "gssqc1 $f10, $f8, 0xD0(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f8 \n\t"
- "punpcklhw $f12, $f8, $f8 \n\t"
- "punpcklwd $f16, $f12, $f12 \n\t"
- "mov.d $f18, $f16 \n\t"
-
- "dmtc1 %[iBeta], $f8 \n\t"
- "punpcklhw $f12, $f8, $f8 \n\t"
- "punpcklwd $f8, $f12, $f12 \n\t"
- "mov.d $f10, $f8 \n\t"
-
- "gslqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
- "gssqc1 $f10, $f8, 0x50(%[tmp]) \n\t"
- "punpckhbh $f10, $f24, $f4 \n\t"
- "punpcklbh $f8, $f24, $f4 \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
-
- "gssqc1 $f10, $f8, 0x40(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x90(%[tmp]) \n\t"
- "punpcklbh $f8, $f28, $f4 \n\t"
- "punpckhbh $f10, $f28, $f4 \n\t"
- "punpcklbh $f28, $f30, $f6 \n\t"
- "punpckhbh $f30, $f30, $f6 \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
- "punpckhbh $f14, $f12, $f4 \n\t"
- "punpcklbh $f12, $f12, $f4 \n\t"
- "punpckhbh $f22, $f20, $f4 \n\t"
- "punpcklbh $f20, $f20, $f4 \n\t"
- "gssqc1 $f30, $f28, 0xF0(%[tmp]) \n\t"
- "gssqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0xA0(%[tmp]) \n\t"
- "punpcklbh $f24, $f26, $f6 \n\t"
- "punpckhbh $f26, $f26, $f6 \n\t"
-
- "dli $13, 0x4 \n\t"
- "gssqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
- "dmtc1 $13, $f24 \n\t"
- "punpcklhw $f28, $f24, $f24 \n\t"
- "punpcklwd $f24, $f28, $f28 \n\t"
- "mov.d $f26, $f24 \n\t"
- "dli $12, 0x2 \n\t"
- "dli $13, 0x3 \n\t"
-
- "gssqc1 $f2, $f0, 0x20(%[tmp]) \n\t"
- "dmfc1 %[iAlpha], $f0 \n\t"
- "dmfc1 %[iBeta], $f2 \n\t"
- "gssqc1 $f26, $f24, 0x30(%[tmp]) \n\t"
- "gslqc1 $f30, $f28, 0x40(%[tmp]) \n\t"
- "psubh $f28, $f28, $f20 \n\t"
- "psubh $f30, $f30, $f22 \n\t"
- "pcmpgth $f24, $f0, $f4 \n\t"
- "pcmpgth $f26, $f2, $f6 \n\t"
-
- "dmtc1 $12, $f0 \n\t"
- "dmtc1 $13, $f2 \n\t"
- "gssqc1 $f26, $f24, 0x60(%[tmp]) \n\t"
- "gslqc1 $f6, $f4, 0xD0(%[tmp]) \n\t"
- "psubh $f24, $f12, $f8 \n\t"
- "psubh $f26, $f14, $f10 \n\t"
- "psllh $f24, $f24, $f0 \n\t"
- "psllh $f26, $f26, $f0 \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x30(%[tmp]) \n\t"
- "paddh $f24, $f24, $f28 \n\t"
- "paddh $f26, $f26, $f30 \n\t"
- "psrah $f24, $f24, $f2 \n\t"
- "psrah $f26, $f26, $f2 \n\t"
- "pmaxsh $f4, $f4, $f24 \n\t"
- "pmaxsh $f6, $f6, $f26 \n\t"
-
- "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "gslqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
- "pminsh $f24, $f24, $f4 \n\t"
- "pminsh $f26, $f26, $f6 \n\t"
-
- "gssqc1 $f26, $f24, 0x20(%[tmp]) \n\t"
- "psubh $f4, $f8, $f12 \n\t"
- "psubh $f6, $f10, $f14 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
- "pcmpgth $f24, $f16, $f4 \n\t"
- "pcmpgth $f26, $f18, $f6 \n\t"
- "gslqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
- "psubh $f4, $f4, $f8 \n\t"
- "psubh $f6, $f6, $f10 \n\t"
- WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
- "pcmpgth $f28, $f28, $f4 \n\t"
- "pcmpgth $f30, $f30, $f6 \n\t"
-
- "gslqc1 $f6, $f4, 0x50(%[tmp]) \n\t"
- "and $f24, $f24, $f28 \n\t"
- "and $f26, $f26, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "psubh $f20, $f20, $f12 \n\t"
- "psubh $f22, $f22, $f14 \n\t"
- WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
- "pcmpgth $f4, $f4, $f20 \n\t"
- "pcmpgth $f6, $f6, $f22 \n\t"
-
- "gslqc1 $f22, $f20, 0xB0(%[tmp]) \n\t"
- "gslqc1 $f2, $f0, 0xE0(%[tmp]) \n\t"
- "psubh $f20, $f20, $f0 \n\t"
- "psubh $f22, $f22, $f2 \n\t"
- "and $f24, $f24, $f4 \n\t"
- "and $f26, $f26, $f6 \n\t"
- "gslqc1 $f2, $f0, 0x60(%[tmp]) \n\t"
- "and $f24, $f24, $f0 \n\t"
- "and $f26, $f26, $f2 \n\t"
-
- "gslqc1 $f6, $f4, 0x20(%[tmp]) \n\t"
- "and $f4, $f4, $f24 \n\t"
- "and $f6, $f6, $f26 \n\t"
- "gslqc1 $f26, $f24, 0xC0(%[tmp]) \n\t"
- "gssqc1 $f6, $f4, 0x40(%[tmp]) \n\t"
- "gslqc1 $f6, $f4, 0xF0(%[tmp]) \n\t"
-
- "dmtc1 $12, $f0 \n\t"
- "psubh $f24, $f24, $f4 \n\t"
- "psubh $f26, $f26, $f6 \n\t"
- "psllh $f24, $f24, $f0 \n\t"
- "psllh $f26, $f26, $f0 \n\t"
- "paddh $f24, $f24, $f20 \n\t"
- "paddh $f26, $f26, $f22 \n\t"
- "gslqc1 $f2, $f0, 0x30(%[tmp]) \n\t"
- "paddh $f24, $f24, $f0 \n\t"
- "paddh $f26, $f26, $f2 \n\t"
- "dmtc1 %[iBeta], $f2 \n\t"
-
- "dmtc1 $13, $f0 \n\t"
- "gslqc1 $f22, $f20, 0xD0(%[tmp]) \n\t"
- "psrah $f24, $f24, $f0 \n\t"
- "psrah $f26, $f26, $f0 \n\t"
- "dmtc1 %[iAlpha], $f0 \n\t"
- "pmaxsh $f20, $f20, $f24 \n\t"
- "pmaxsh $f22, $f22, $f26 \n\t"
- "pminsh $f0, $f0, $f20 \n\t"
- "pminsh $f2, $f2, $f22 \n\t"
-
- "dmfc1 %[iAlpha], $f0 \n\t"
- "dmfc1 %[iBeta], $f2 \n\t"
- "gslqc1 $f22, $f20, 0xC0(%[tmp]) \n\t"
- "psubh $f24, $f4, $f20 \n\t"
- "psubh $f26, $f6, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
- "pcmpgth $f16, $f16, $f24 \n\t"
- "pcmpgth $f18, $f18, $f26 \n\t"
-
- "gslqc1 $f26, $f24, 0xB0(%[tmp]) \n\t"
- "psubh $f24, $f24, $f4 \n\t"
- "psubh $f26, $f26, $f6 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
- "pcmpgth $f28, $f28, $f24 \n\t"
- "pcmpgth $f30, $f30, $f26 \n\t"
-
- "gslqc1 $f26, $f24, 0xE0(%[tmp]) \n\t"
- "and $f16, $f16, $f28 \n\t"
- "and $f18, $f18, $f30 \n\t"
-
- "gslqc1 $f30, $f28, 0x50(%[tmp]) \n\t"
- "psubh $f24, $f24, $f20 \n\t"
- "psubh $f26, $f26, $f22 \n\t"
- WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
- "pcmpgth $f28, $f28, $f24 \n\t"
- "pcmpgth $f30, $f30, $f26 \n\t"
- "and $f16, $f16, $f28 \n\t"
- "and $f18, $f18, $f30 \n\t"
- "gslqc1 $f30, $f28, 0x60(%[tmp]) \n\t"
- "dmtc1 %[iAlpha], $f0 \n\t"
- "dmtc1 %[iBeta], $f2 \n\t"
- "and $f16, $f16, $f28 \n\t"
- "and $f18, $f18, $f30 \n\t"
- "and $f0, $f0, $f16 \n\t"
- "and $f2, $f2, $f18 \n\t"
-
- "gslqc1 $f18, $f16, 0x40(%[tmp]) \n\t"
- "paddh $f8, $f8, $f16 \n\t"
- "paddh $f10, $f10, $f18 \n\t"
- "paddh $f4, $f4, $f0 \n\t"
- "paddh $f6, $f6, $f2 \n\t"
- "psubh $f12, $f12, $f16 \n\t"
- "psubh $f14, $f14, $f18 \n\t"
- "psubh $f20, $f20, $f0 \n\t"
- "psubh $f22, $f22, $f2 \n\t"
- "packushb $f8, $f8, $f10 \n\t"
- "packushb $f10, $f4, $f6 \n\t"
- "packushb $f12, $f12, $f14 \n\t"
- "packushb $f14, $f20, $f22 \n\t"
-
- "gssqc1 $f10, $f8, 0x80(%[tmp]) \n\t"
- "gssqc1 $f14, $f12, 0x90(%[tmp]) \n\t"
- "daddiu $11, %[tmp], 0x70 \n\t"
-
- "gslqc1 $f2, $f0, 0x0($11) \n\t"
- "gslqc1 $f6, $f4, 0x10($11) \n\t"
- "gslqc1 $f10, $f8, 0x20($11) \n\t"
- "gslqc1 $f14, $f12, 0x30($11) \n\t"
-
- "punpcklbh $f24, $f2, $f6 \n\t"
- "punpckhbh $f26, $f2, $f6 \n\t"
- "punpckhbh $f2, $f0, $f4 \n\t"
- "punpcklbh $f0, $f0, $f4 \n\t"
-
- "punpcklbh $f28, $f10, $f14 \n\t"
- "punpckhbh $f30, $f10, $f14 \n\t"
- "punpckhbh $f10, $f8, $f12 \n\t"
- "punpcklbh $f8, $f8, $f12 \n\t"
-
- "punpcklhw $f16, $f2, $f10 \n\t"
- "punpckhhw $f18, $f2, $f10 \n\t"
- "punpckhhw $f2, $f0, $f8 \n\t"
- "punpcklhw $f0, $f0, $f8 \n\t"
- "punpcklhw $f20, $f26, $f30 \n\t"
- "punpckhhw $f22, $f26, $f30 \n\t"
- "punpckhhw $f26, $f24, $f28 \n\t"
- "punpcklhw $f24, $f24, $f28 \n\t"
-
- "punpcklwd $f4, $f2, $f26 \n\t"
- "punpckhwd $f6, $f2, $f26 \n\t"
- "punpckhwd $f2, $f0, $f24 \n\t"
- "punpcklwd $f0, $f0, $f24 \n\t"
- "punpcklwd $f8, $f18, $f22 \n\t"
- "punpckhwd $f10, $f18, $f22 \n\t"
- "punpckhwd $f18, $f16, $f20 \n\t"
- "punpcklwd $f16, $f16, $f20 \n\t"
-
- "mov.d $f20, $f2 \n\t"
- "mov.d $f22, $f18 \n\t"
- "mov.d $f2, $f16 \n\t"
- "mov.d $f24, $f6 \n\t"
- "mov.d $f26, $f10 \n\t"
- "mov.d $f6, $f8 \n\t"
-
- "dli %[iAlpha], 0x20 \n\t"
- "daddu $8, %[pPixCb], %[iStride] \n\t"
- "gsswlc1 $f0, 0x3(%[pPixCb]) \n\t"
- "gsswlc1 $f20, 0x3($8) \n\t"
- "gsswrc1 $f0, 0x0(%[pPixCb]) \n\t"
- "gsswrc1 $f20, 0x0($8) \n\t"
- "daddu $9, $8, %[iStride] \n\t"
- "daddu $8, $9, %[iStride] \n\t"
- "gsswlc1 $f4, 0x3($9) \n\t"
- "gsswlc1 $f24, 0x3($8) \n\t"
- "gsswrc1 $f4, 0x0($9) \n\t"
- "gsswrc1 $f24, 0x0($8) \n\t"
- "daddu $9, $8, %[iStride] \n\t"
- "dmtc1 %[iAlpha], $f8 \n\t"
-
- "dsrl $f0, $f0, $f8 \n\t"
- "dsrl $f20, $f20, $f8 \n\t"
- "dsrl $f4, $f4, $f8 \n\t"
- "dsrl $f24, $f24, $f8 \n\t"
- "daddu $10, %[pPixCr], %[iStride] \n\t"
- "gsswlc1 $f0, 0x3(%[pPixCr]) \n\t"
- "gsswlc1 $f20, 0x3($10) \n\t"
- "gsswrc1 $f0, 0x0(%[pPixCr]) \n\t"
- "gsswrc1 $f20, 0x0($10) \n\t"
- "daddu $11, $10, %[iStride] \n\t"
- "daddu $10, $11, %[iStride] \n\t"
- "gsswlc1 $f4, 0x3($11) \n\t"
- "gsswlc1 $f24, 0x3($10) \n\t"
- "gsswrc1 $f4, 0x0($11) \n\t"
- "gsswrc1 $f24, 0x0($10) \n\t"
- "daddu $11, $10, %[iStride] \n\t"
-
- "daddu $8, $9, %[iStride] \n\t"
- "gsswlc1 $f2, 0x3($9) \n\t"
- "gsswlc1 $f22, 0x3($8) \n\t"
- "gsswrc1 $f2, 0x0($9) \n\t"
- "gsswrc1 $f22, 0x0($8) \n\t"
- "daddu $9, $8, %[iStride] \n\t"
- "daddu $8, $9, %[iStride] \n\t"
- "gsswlc1 $f6, 0x3($9) \n\t"
- "gsswlc1 $f26, 0x3($8) \n\t"
- "gsswrc1 $f6, 0x0($9) \n\t"
- "gsswrc1 $f26, 0x0($8) \n\t"
-
- "dsrl $f2, $f2, $f8 \n\t"
- "dsrl $f22, $f22, $f8 \n\t"
- "dsrl $f6, $f6, $f8 \n\t"
- "dsrl $f26, $f26, $f8 \n\t"
- "daddu $10, $11, %[iStride] \n\t"
- "gsswlc1 $f2, 0x3($11) \n\t"
- "gsswlc1 $f22, 0x3($10) \n\t"
- "gsswrc1 $f2, 0x0($11) \n\t"
- "gsswrc1 $f22, 0x0($10) \n\t"
- "daddu $11, $10, %[iStride] \n\t"
- "daddu $10, $11, %[iStride] \n\t"
- "gsswlc1 $f6, 0x3($11) \n\t"
- "gsswlc1 $f26, 0x3($10) \n\t"
- "gsswrc1 $f6, 0x0($11) \n\t"
- "gsswrc1 $f26, 0x0($10) \n\t"
- : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
- : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
- [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
- : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
- "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
- "$f26", "$f28", "$f30"
- );
- RECOVER_REG;
-}
-
-void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
- __asm__ volatile(
- ".set arch=loongson3a \n\t"
- "gsldlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
- "gsldlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
- "gsldlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
- "gsldrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
- "gsldrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
- "gsldrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
- "pcmpeqh $f8, $f8, $f8 \n\t"
- "dli $8, 0xF \n\t"
- "dmtc1 $8, $f6 \n\t"
- "psrlh $f8, $f8, $f6 \n\t"
- "packushb $f8, $f8, $f8 \n\t"
-
- "pminub $f0, $f0, $f8 \n\t"
- "pminub $f2, $f2, $f8 \n\t"
- "pminub $f4, $f4, $f8 \n\t"
- "gssdlc1 $f0, 0x7(%[pNonZeroCount]) \n\t"
- "gssdlc1 $f2, 0xF(%[pNonZeroCount]) \n\t"
- "gssdlc1 $f4, 0x17(%[pNonZeroCount]) \n\t"
- "gssdrc1 $f0, 0x0(%[pNonZeroCount]) \n\t"
- "gssdrc1 $f2, 0x8(%[pNonZeroCount]) \n\t"
- "gssdrc1 $f4, 0x10(%[pNonZeroCount]) \n\t"
- :
- : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
- : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
- );
-}
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -63,14 +63,14 @@
endif
OBJS += $(COMMON_OBJSARM64)
-COMMON_ASM_MIPS64_SRCS=\
- $(COMMON_SRCDIR)/mips64/deblock_mmi.c\
+COMMON_ASM_MIPS_SRCS=\
+ $(COMMON_SRCDIR)/mips/deblock_mmi.c\
-COMMON_OBJSMIPS64 += $(COMMON_ASM_MIPS64_SRCS:.c=.$(OBJ))
-ifeq ($(ASM_ARCH), mips64)
-COMMON_OBJS += $(COMMON_OBJSMIPS64)
+COMMON_OBJSMIPS += $(COMMON_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+COMMON_OBJS += $(COMMON_OBJSMIPS)
endif
-OBJS += $(COMMON_OBJSMIPS64)
+OBJS += $(COMMON_OBJSMIPS)
OBJS += $(COMMON_OBJS)
@@ -77,14 +77,14 @@
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.cpp
$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c $(CXX_O) $<
+$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c
+ $(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c $(CXX_O) $<
+
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.asm
$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $@ $<
$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.S
$(QUIET_CCAS)$(CCAS) $(CCASFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
-
-$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c
- $(QUIET_CC)$(CC) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
$(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
$(QUIET)rm -f $@