shithub: openh264

Download patch

ref: 14d7bf0744bfb023544be31f853df71a2b7eebb4
parent: 4390f83cecc820beea75016b9aa74544d9d5d3e4
author: gxw <[email protected]>
date: Tue Aug 7 07:57:29 EDT 2018

Modify targets.mk generation method

Edit the build/mktargets.py instead of the targets.mk directly.
Rename codec/common/mips64 to codec/common/mips.

--- a/build/arch.mk
+++ b/build/arch.mk
@@ -31,10 +31,10 @@
 endif
 
 #for loongson
-ifneq ($(filter mips64, $(ARCH)),)
+ifneq ($(filter mips mips64, $(ARCH)),)
 ifeq ($(USE_ASM), Yes)
-ASM_ARCH = mips64
-ASMFLAGS += -I$(SRC_PATH)codec/common/mips64/
+ASM_ARCH = mips
+ASMFLAGS += -I$(SRC_PATH)codec/common/mips/
 LOONGSON3A = $(shell g++ -dM -E - < /dev/null | grep '_MIPS_TUNE ' | cut -f 3 -d " ")
 ifeq ($(LOONGSON3A), "loongson3a")
 CFLAGS += -DHAVE_MMI
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -117,9 +117,16 @@
         arm64files.append(file)
     elif 'arm' in c:
         armfiles.append(file)
+mipsfiles = []
+for file in cfiles:  # collect every C source that lives under a 'mips' path component
+  c = file.split('/')
+  if 'mips' in c:
+    mipsfiles.append(file)
+cfiles = [x for x in cfiles if x not in mipsfiles]  # filter after the loop: removing while iterating skips entries
 
 
 
+
 f = open(OUTFILE, "w")
 f.write("%s_SRCDIR=%s\n"%(PREFIX, args.directory))
 
@@ -169,10 +176,21 @@
     f.write("endif\n")
     f.write("OBJS += $(%s_OBJSARM64)\n\n"%(PREFIX))
 
+if len(mipsfiles) > 0:  # emit the MIPS section only when MIPS sources were collected
+  f.write("%s_ASM_MIPS_SRCS=\\\n"%(PREFIX))
+  for c in mipsfiles:  # one backslash-continued, tab-indented source path per line
+    f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
+  f.write("\n")
+  f.write("%s_OBJSMIPS += $(%s_ASM_MIPS_SRCS:.c=.$(OBJ))\n"%(PREFIX, PREFIX))  # object list: .c -> .$(OBJ)
+  f.write("ifeq ($(ASM_ARCH), mips)\n")  # <PREFIX>_OBJS receives the MIPS objects only when ASM_ARCH is mips
+  f.write("%s_OBJS += $(%s_OBJSMIPS)\n"%(PREFIX,PREFIX))
+  f.write("endif\n")
+  f.write("OBJS += $(%s_OBJSMIPS)\n\n"%(PREFIX))  # the global OBJS list receives them unconditionally
+
 f.write("OBJS += $(%s_OBJS)\n\n"%(PREFIX))
 write_cpp_rule_pattern(f)
 
-if len(cfiles) > 0:
+if len(cfiles) > 0 or len(mipsfiles) > 0:
     write_c_rule_pattern(f)
 
 if len(asm) > 0:
--- /dev/null
+++ b/codec/common/mips/deblock_mmi.c
@@ -1,0 +1,2826 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    deblock_mmi.c
+ *
+ * \brief   Loongson optimize
+ *
+ * \date    20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+                         int32_t iBeta, int8_t *pTC) {
+  unsigned char tmp[512] __attribute__((aligned(32)));  /* spill area addressed through %[tmp] */
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dsll       $8, %[iStride], 0x1                       \n\t"
+    "daddu      $8, $8, %[iStride]                        \n\t"
+    "dsubu      $14, %[pPix], $8                          \n\t"
+    /* above: $14 = pPix - 3*iStride; below: $9 = pPix - 2*iStride */
+    "dsll       $8, %[iStride], 0x1                       \n\t"
+    "dsubu      $9, %[pPix], $8                           \n\t"
+    /* $13 = pPix - iStride, %[iStride] := pPix + iStride, $12 = pPix + 2*iStride */
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "dsubu      $13, %[pPix], %[iStride]                  \n\t"
+    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
+    "daddu      $12, $8, %[pPix]                          \n\t"
+    /* from here %[iAlpha], %[iBeta] and %[pTC] are also reused as scratch */
+    "punpcklhw  $f0, $f0, $f0                             \n\t"
+    "lb         $8, 0x0(%[pTC])                           \n\t"
+    "punpcklwd  $f0, $f0, $f0                             \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+    "gssqc1     $f2, $f0, 432-112(%[tmp])                 \n\t"
+    "dmtc1      %[iBeta], $f0                             \n\t"
+    "lb         %[iAlpha], 0x1(%[pTC])                    \n\t"
+    "dli        %[iBeta], 0xFFFF                          \n\t"
+    "punpcklhw  $f0, $f0, $f0                             \n\t"
+    "and        $10, %[iAlpha], %[iBeta]                  \n\t"
+    "punpcklwd  $f0, $f0, $f0                             \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
+    "dmtc1      $10, $f4                                  \n\t"
+    "mov.d      $f8, $f4                                  \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "and        %[iAlpha], $8, %[iBeta]                   \n\t"
+    "dmtc1      %[iAlpha], $f20                           \n\t"
+    "mov.d      $f24, $f20                                \n\t"
+    "mov.d      $f28, $f20                                \n\t"
+    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+
+    "lb         %[iAlpha], 0x3(%[pTC])                    \n\t"
+    "lb         %[pTC], 0x2(%[pTC])                       \n\t"
+    "dmtc1      $10, $f12                                 \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "and        $8, %[iAlpha], %[iBeta]                   \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+    "punpcklhw  $f20, $f20, $f4                           \n\t"
+    "punpcklhw  $f0, $f0, $f24                            \n\t"
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "punpckhhw  $f2, $f0, $f28                            \n\t"
+    "punpcklhw  $f0, $f0, $f28                            \n\t"
+    "gssqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
+    "dmtc1      $8, $f0                                   \n\t"
+    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
+    "mov.d      $f8, $f0                                  \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "and        %[iAlpha], %[pTC], %[iBeta]               \n\t"
+    "dmtc1      $8, $f12                                  \n\t"
+    "dmtc1      %[iAlpha], $f20                           \n\t"
+    "punpcklhw  $f20, $f20, $f0                           \n\t"
+
+    "xor        $f0, $f0, $f0                             \n\t"
+    "dmtc1      %[iAlpha], $f24                           \n\t"
+    "and        %[pTC], %[pTC], %[iBeta]                  \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "dmtc1      %[pTC], $f4                               \n\t"
+
+    "gslqc1     $f10, $f8, 0x0($9)                        \n\t"
+    "punpckhbh  $f10, $f8, $f0                            \n\t"
+    "punpcklbh  $f8, $f8, $f0                             \n\t"
+
+    "dli        %[iAlpha], 0x4                            \n\t"
+    "seh        %[pTC], %[iAlpha]                         \n\t"
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
+    "gslqc1     $f14, $f12, 0x0($13)                      \n\t"
+    "gsldxc1    $f2, 0x0($12, $0)                         \n\t"
+    "punpckhbh  $f22, $f20, $f0                           \n\t"
+    "punpcklbh  $f20, $f20, $f0                           \n\t"
+    "gssqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
+    "punpckhbh  $f22, $f2, $f0                            \n\t"
+    "punpcklbh  $f20, $f2, $f0                            \n\t"
+    "gssqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
+    "punpcklhw  $f4, $f4, $f16                            \n\t"
+    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
+    "punpcklhw  $f4, $f4, $f24                            \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
+    "punpckhhw  $f6, $f4, $f28                            \n\t"
+    "punpcklhw  $f4, $f4, $f28                            \n\t"
+    "punpckhbh  $f26, $f24, $f0                           \n\t"
+    "punpcklbh  $f24, $f24, $f0                           \n\t"
+    "punpckhbh  $f14, $f12, $f0                           \n\t"
+    "punpcklbh  $f12, $f12, $f0                           \n\t"
+    "punpckhbh  $f18, $f16, $f0                           \n\t"
+    "punpcklbh  $f16, $f16, $f0                           \n\t"
+    "psubh      $f28, $f12, $f16                          \n\t"
+    "psubh      $f30, $f14, $f18                          \n\t"
+    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
+    "gslqc1     $f18, $f16, 432-336(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
+    "psubh      $f28, $f24, $f0                           \n\t"
+    "psubh      $f30, $f26, $f2                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
+    "pavgh      $f20, $f12, $f24                          \n\t"
+    "pavgh      $f22, $f14, $f26                          \n\t"
+    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-256(%[tmp])                 \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
+    "psubh      $f20, $f24, $f12                          \n\t"
+    "psubh      $f22, $f26, $f14                          \n\t"
+    "gssqc1     $f26, $f24, 432-32(%[tmp])                \n\t"
+    "psubh      $f24, $f24, $f0                           \n\t"
+    "psubh      $f26, $f26, $f2                           \n\t"
+    "gssqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
+    WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
+    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
+    "pcmpgth    $f20, $f20, $f28                          \n\t"
+    "pcmpgth    $f22, $f22, $f30                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f16, $f24                          \n\t"
+    "pcmpgth    $f30, $f18, $f26                          \n\t"
+
+    "xor        $f0, $f0, $f0                             \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "psubh      $f24, $f12, $f8                           \n\t"
+    "psubh      $f26, $f14, $f10                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f16, $f24                          \n\t"
+    "pcmpgth    $f30, $f18, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "pcmpgth    $f28, $f24, $f0                           \n\t"
+    "pcmpgth    $f30, $f26, $f0                           \n\t"
+    "pcmpeqh    $f24, $f24, $f0                           \n\t"
+    "pcmpeqh    $f26, $f26, $f0                           \n\t"
+    "or         $f28, $f28, $f24                          \n\t"
+    "or         $f30, $f30, $f26                          \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
+    "dmtc1      %[pTC], $f20                              \n\t"
+    "punpckhhw  $f26, $f20, $f20                          \n\t"
+    "punpcklhw  $f24, $f20, $f20                          \n\t"
+    "punpcklwd  $f20, $f24, $f24                          \n\t"
+    "mov.d      $f22, $f20                                \n\t"
+    "gssqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "psubh      $f24, $f0, $f20                           \n\t"
+    "dli        $11, 0x2                                  \n\t"
+    "psubh      $f26, $f0, $f22                           \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
+    "psllh      $f20, $f20, $f28                          \n\t"
+    "psllh      $f22, $f22, $f28                          \n\t"
+    "psubh      $f28, $f8, $f0                            \n\t"
+    "psubh      $f30, $f10, $f2                           \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "dli        $11, 0x3                                  \n\t"
+    "dmtc1      $11, $f20                                 \n\t"
+    "psrah      $f28, $f28, $f20                          \n\t"
+    "psrah      $f30, $f30, $f20                          \n\t"
+    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "pmaxsh     $f24, $f24, $f28                          \n\t"
+    "pmaxsh     $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f2, $f0, 432-320(%[tmp])                 \n\t"
+    "pminsh     $f20, $f20, $f24                          \n\t"
+    "pminsh     $f22, $f22, $f26                          \n\t"
+
+    "and        $f20, $f20, $f0                           \n\t"
+    "and        $f22, $f22, $f2                           \n\t"
+    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-64(%[tmp])                \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "gssqc1     $f26, $f24, 432-384(%[tmp])               \n\t"
+    "psubh      $f20, $f0, $f24                           \n\t"
+    "psubh      $f22, $f0, $f26                           \n\t"
+    "gssqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
+    "mov.d      $f24, $f20                                \n\t"
+    "mov.d      $f26, $f22                                \n\t"
+    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "paddh      $f28, $f8, $f8                            \n\t"
+    "paddh      $f30, $f10, $f10                          \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "pmaxsh     $f24, $f24, $f20                          \n\t"
+    "pmaxsh     $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
+    "pminsh     $f20, $f20, $f24                          \n\t"
+    "pminsh     $f22, $f22, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
+    "and        $f20, $f20, $f24                          \n\t"
+    "and        $f22, $f22, $f26                          \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gslqc1     $f26, $f24, 432-240(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-96(%[tmp])                \n\t"
+    "gslqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "paddh      $f28, $f24, $f24                          \n\t"
+    "paddh      $f30, $f26, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "gslqc1     $f30, $f28, 0x0(%[iStride])               \n\t"
+    "pmaxsh     $f24, $f24, $f20                          \n\t"
+    "pmaxsh     $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "pminsh     $f20, $f20, $f24                          \n\t"
+    "pminsh     $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
+    "and        $f20, $f20, $f24                          \n\t"
+    "and        $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-256(%[tmp])               \n\t"
+    "and        $f20, $f20, $f24                          \n\t"
+    "and        $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x0($9)                       \n\t"
+    "punpcklbh  $f28, $f30, $f0                           \n\t"
+    "punpckhbh  $f30, $f30, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
+
+    "gslqc1     $f30, $f28, 0x0($12)                      \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f22, $f20, 432-48(%[tmp])                \n\t"
+    "gslqc1     $f22, $f20, 0x0($14)                      \n\t"
+    "gssqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 0x0($13)                      \n\t"
+    "punpcklbh  $f28, $f30, $f0                           \n\t"
+    "punpckhbh  $f30, $f30, $f0                           \n\t"
+    "punpcklbh  $f20, $f22, $f0                           \n\t"
+    "punpckhbh  $f22, $f22, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
+
+    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    "gssqc1     $f22, $f20, 432-16(%[tmp])                \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
+
+    "psubh      $f28, $f24, $f28                          \n\t"
+    "psubh      $f30, $f26, $f30                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
+    "pcmpgth    $f20, $f16, $f28                          \n\t"
+    "pcmpgth    $f22, $f18, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
+
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "gssqc1     $f26, $f24, 432-80(%[tmp])                \n\t"
+    "pavgh      $f20, $f20, $f24                          \n\t"
+    "pavgh      $f22, $f22, $f26                          \n\t"
+    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
+
+    "gslqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-256(%[tmp])               \n\t"
+    "psubh      $f20, $f4, $f20                           \n\t"
+    "psubh      $f22, $f6, $f22                           \n\t"
+    "psubh      $f20, $f20, $f28                          \n\t"
+    "psubh      $f22, $f22, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
+    "psubh      $f20, $f24, $f20                          \n\t"
+    "psubh      $f22, $f26, $f22                          \n\t"
+    "psubh      $f24, $f24, $f28                          \n\t"
+    "psubh      $f26, $f26, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "mov.d      $f28, $f20                                \n\t"
+    "mov.d      $f30, $f22                                \n\t"
+    WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
+    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
+    "pcmpgth    $f20, $f20, $f28                          \n\t"
+    "pcmpgth    $f22, $f22, $f30                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f16, $f24                          \n\t"
+    "pcmpgth    $f30, $f18, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
+
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
+    "psubh      $f28, $f28, $f24                          \n\t"
+    "psubh      $f30, $f30, $f26                          \n\t"
+    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
+    "psubh      $f24, $f24, $f0                           \n\t"
+    "psubh      $f26, $f26, $f2                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-96(%[tmp])                \n\t"
+    "and        $f20, $f20, $f16                          \n\t"
+    "and        $f22, $f22, $f18                          \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "pcmpgth    $f16, $f4, $f0                            \n\t"
+    "pcmpgth    $f18, $f6, $f0                            \n\t"
+    "pcmpeqh    $f28, $f4, $f0                            \n\t"
+    "pcmpeqh    $f30, $f6, $f0                            \n\t"
+    "or         $f16, $f16, $f28                          \n\t"
+    "or         $f18, $f18, $f30                          \n\t"
+    "and        $f20, $f20, $f16                          \n\t"
+    "and        $f22, $f22, $f18                          \n\t"
+    "gslqc1     $f18, $f16, 432-224(%[tmp])               \n\t"
+    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "dli        $11, 0x2                                  \n\t"
+    "psubh      $f28, $f0, $f16                           \n\t"
+    "psubh      $f30, $f0, $f18                           \n\t"
+    "psubh      $f2, $f0, $f6                             \n\t"
+    "psubh      $f0, $f0, $f4                             \n\t"
+    "dmfc1      %[iAlpha], $f28                           \n\t"
+    "dmtc1      $11, $f28                                 \n\t"
+    "psllh      $f20, $f20, $f28                          \n\t"
+    "psllh      $f22, $f22, $f28                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
+    "dli        $11, 0x3                                  \n\t"
+    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
+    "dmfc1      %[iAlpha], $f0                            \n\t"
+    "dmtc1      $11, $f0                                  \n\t"
+    "psrah      $f24, $f24, $f0                           \n\t"
+    "psrah      $f26, $f26, $f0                           \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "pmaxsh     $f28, $f28, $f24                          \n\t"
+    "pmaxsh     $f30, $f30, $f26                          \n\t"
+    "pminsh     $f16, $f16, $f28                          \n\t"
+    "pminsh     $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 432-320(%[tmp])               \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "mov.d      $f24, $f0                                 \n\t"
+    "mov.d      $f26, $f2                                 \n\t"
+    "gslqc1     $f2, $f0, 432-16(%[tmp])                  \n\t"
+    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 432-368(%[tmp])               \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "paddh      $f16, $f16, $f16                          \n\t"
+    "paddh      $f18, $f18, $f18                          \n\t"
+    "psubh      $f0, $f0, $f16                            \n\t"
+    "psubh      $f2, $f2, $f18                            \n\t"
+
+    "dmtc1      $11, $f28                                 \n\t"
+    "gslqc1     $f18, $f16, 432-64(%[tmp])                \n\t"
+    "psrah      $f0, $f0, $f28                            \n\t"
+    "psrah      $f2, $f2, $f28                            \n\t"
+    "pmaxsh     $f24, $f24, $f0                           \n\t"
+    "pmaxsh     $f26, $f26, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
+    "pminsh     $f28, $f4, $f24                           \n\t"
+    "pminsh     $f30, $f6, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
+    "and        $f28, $f28, $f24                          \n\t"
+    "and        $f30, $f30, $f26                          \n\t"
+    "dmfc1      %[iAlpha], $f24                           \n\t"
+    "dmfc1      %[iBeta], $f26                            \n\t"
+    "gslqc1     $f26, $f24, 432-288(%[tmp])               \n\t"
+    "and        $f28, $f28, $f24                          \n\t"
+    "and        $f30, $f30, $f26                          \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f20, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f20                            \n\t"
+    "paddh      $f2, $f2, $f22                            \n\t"
+    "paddh      $f12, $f12, $f16                          \n\t"
+    "paddh      $f14, $f14, $f18                          \n\t"
+    "packushb   $f12, $f12, $f14                          \n\t"
+    "packushb   $f14, $f0, $f2                            \n\t"
+
+    "gslqc1     $f2, $f0, 432-32(%[tmp])                  \n\t"
+    "psubh      $f0, $f0, $f16                            \n\t"
+    "psubh      $f2, $f2, $f18                            \n\t"
+    "gslqc1     $f18, $f16, 432-80(%[tmp])                \n\t"
+    "psubh      $f16, $f16, $f20                          \n\t"
+    "gslqc1     $f26, $f24, 432-48(%[tmp])                \n\t"
+    "psubh      $f18, $f18, $f22                          \n\t"
+
+    "gslqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f24                          \n\t"
+    "paddh      $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 432-304(%[tmp])               \n\t"
+    "packushb   $f0, $f0, $f2                             \n\t"
+    "packushb   $f2, $f16, $f18                           \n\t"
+    "gslqc1     $f18, $f16, 432-384(%[tmp])               \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "gssqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
+    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
+    "mov.d      $f28, $f0                                 \n\t"
+    "mov.d      $f30, $f2                                 \n\t"
+    "paddh      $f0, $f0, $f0                             \n\t"
+    "paddh      $f2, $f2, $f2                             \n\t"
+
+    "dmtc1      %[iAlpha], $f24                           \n\t"
+    "dmtc1      %[iBeta], $f26                            \n\t"
+
+    "psubh      $f16, $f16, $f0                           \n\t"
+    "psubh      $f18, $f18, $f2                           \n\t"
+    "dli        $11, 0x1                                  \n\t"
+    "gslqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
+    "gssqc1     $f10, $f8, 0x0($9)                        \n\t"
+    "dmtc1      $11, $f8                                  \n\t"
+    "psrah      $f16, $f16, $f8                           \n\t"
+    "psrah      $f18, $f18, $f8                           \n\t"
+    "pmaxsh     $f0, $f0, $f16                            \n\t"
+    "pmaxsh     $f2, $f2, $f18                            \n\t"
+    "pminsh     $f4, $f4, $f0                             \n\t"
+    "pminsh     $f6, $f6, $f2                             \n\t"
+    "gslqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
+
+    "gslqc1     $f10, $f8, 428-256+4(%[tmp])              \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+    "and        $f4, $f4, $f8                             \n\t"
+    "and        $f6, $f6, $f10                            \n\t"
+    "gssqc1     $f14, $f12, 0x0($13)                      \n\t"
+    "paddh      $f28, $f28, $f4                           \n\t"
+    "paddh      $f30, $f30, $f6                           \n\t"
+    "packushb   $f20, $f20, $f22                          \n\t"
+    "packushb   $f22, $f28, $f30                          \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
+    "gssqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
+    : [pPix]"+&r"((unsigned char *)pPix), [iStride]"+&r"(iStride),
+      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta), [pTC]"+&r"(pTC)
+    : [tmp]"r"((unsigned char *)tmp)  /* only tmp is a pure input; the rest are overwritten or kept read-write */
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
+                                 uint8_t *pDst) { /*
+   * Gathers 8 bytes from each of 16 consecutive picture rows and writes the
+   * rearranged data as eight 16-byte vectors into a contiguous buffer.
+   *
+   * pPixY   : address of the first source row; rows 0..15 are read at
+   *           pPixY + k * iStride, 8 bytes each.
+   * iStride : distance in bytes between two consecutive rows.
+   * pDst    : destination buffer; bytes pDst[0x00..0x7f] are written.
+   *           NOTE(review): the 128-bit gssqc1 stores presumably need pDst
+   *           to be 16-byte aligned - confirm against the callers.
+   *
+   * BACKUP_REG, RECOVER_REG and MMI_TransTwo8x8B are defined outside this
+   * function. Judging by their names they save/restore registers and
+   * transpose two 8x8 byte tiles - confirm against their definitions.
+   */
+  BACKUP_REG;
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t" /* let the assembler accept Loongson-3A opcodes */
+    "dsll       $8, %[iStride], 0x3                       \n\t" /* $8 = 8 * iStride */
+    "daddu      $8, $8, %[pPixY]                          \n\t" /* $8 -> row 8 */
+
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t" /* $9 -> row 1 */
+    "daddu      $10, $8, %[iStride]                       \n\t" /* $10 -> row 9 */
+    "gsldlc1    $f0, 0x7(%[pPixY])                        \n\t" /* each gsldlc1 (offset 7) + gsldrc1 (offset 0) pair is one unaligned 8-byte load */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldlc1    $f4, 0x7($9)                              \n\t"
+    "gsldlc1    $f6, 0x7($10)                             \n\t"
+    "gsldrc1    $f0, 0x0(%[pPixY])                        \n\t" /* f0 = row 0 */
+    "gsldrc1    $f2, 0x0($8)                              \n\t" /* f2 = row 8 */
+    "gsldrc1    $f4, 0x0($9)                              \n\t" /* f4 = row 1 */
+    "gsldrc1    $f6, 0x0($10)                             \n\t" /* f6 = row 9 */
+    "daddu      %[pPixY], $9, %[iStride]                  \n\t" /* pPixY -> row 2 */
+    "daddu      $8, $10, %[iStride]                       \n\t" /* $8 -> row 10 */
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t" /* $9 -> row 3 */
+    "daddu      $10, $8, %[iStride]                       \n\t" /* $10 -> row 11 */
+    "gsldlc1    $f8, 0x7(%[pPixY])                        \n\t"
+    "gsldlc1    $f10, 0x7($8)                             \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7($10)                            \n\t"
+    "gsldrc1    $f8, 0x0(%[pPixY])                        \n\t" /* f8 = row 2 */
+    "gsldrc1    $f10, 0x0($8)                             \n\t" /* f10 = row 10 */
+    "gsldrc1    $f12, 0x0($9)                             \n\t" /* f12 = row 3 */
+    "gsldrc1    $f14, 0x0($10)                            \n\t" /* f14 = row 11 */
+
+    "daddu      %[pPixY], $9, %[iStride]                  \n\t" /* pPixY -> row 4 */
+    "daddu      $8, $10, %[iStride]                       \n\t" /* $8 -> row 12 */
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t" /* $9 -> row 5 */
+    "daddu      $10, $8, %[iStride]                       \n\t" /* $10 -> row 13 */
+    "gsldlc1    $f16, 0x7(%[pPixY])                       \n\t"
+    "gsldlc1    $f18, 0x7($8)                             \n\t"
+    "gsldlc1    $f20, 0x7($9)                             \n\t"
+    "gsldlc1    $f22, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0(%[pPixY])                       \n\t" /* f16 = row 4 */
+    "gsldrc1    $f18, 0x0($8)                             \n\t" /* f18 = row 12 */
+    "gsldrc1    $f20, 0x0($9)                             \n\t" /* f20 = row 5 */
+    "gsldrc1    $f22, 0x0($10)                            \n\t" /* f22 = row 13 */
+    "daddu      %[pPixY], $9, %[iStride]                  \n\t" /* pPixY -> row 6 */
+    "daddu      $8, $10, %[iStride]                       \n\t" /* $8 -> row 14 */
+    "daddu      $9, %[pPixY], %[iStride]                  \n\t" /* $9 -> row 7 */
+    "daddu      $10, $8, %[iStride]                       \n\t" /* $10 -> row 15 */
+    "gsldlc1    $f24, 0x7(%[pPixY])                       \n\t"
+    "gsldlc1    $f26, 0x7($8)                             \n\t"
+
+    "gsldlc1    $f28, 0x7($9)                             \n\t"
+    "gsldlc1    $f30, 0x7($10)                            \n\t"
+    "gsldrc1    $f24, 0x0(%[pPixY])                       \n\t" /* f24 = row 6 */
+    "gsldrc1    $f26, 0x0($8)                             \n\t" /* f26 = row 14 */
+    "gsldrc1    $f28, 0x0($9)                             \n\t" /* f28 = row 7 */
+    "gsldrc1    $f30, 0x0($10)                            \n\t" /* f30 = row 15 */
+
+    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                     $f14, $f16, $f18, $f20, $f22, $f24,
+                     $f26, $f28, $f30, $9, $10) /* macro defined elsewhere; $9/$10 are presumably its scratch registers (no longer needed as row pointers) */
+
+    "gssqc1     $f18, $f16, 0x0(%[pDst])                  \n\t" /* eight 16-byte stores fill pDst[0x00..0x7f]; the register order presumably follows the macro's output layout */
+    "gssqc1     $f10, $f8, 0x10(%[pDst])                  \n\t"
+    "gssqc1     $f14, $f12, 0x20(%[pDst])                 \n\t"
+    "gssqc1     $f30, $f28, 0x30(%[pDst])                 \n\t"
+    "gssqc1     $f22, $f20, 0x40(%[pDst])                 \n\t"
+    "gssqc1     $f6, $f4, 0x50(%[pDst])                   \n\t"
+    "gssqc1     $f26, $f24, 0x60(%[pDst])                 \n\t"
+    "gssqc1     $f2, $f0, 0x70(%[pDst])                   \n\t"
+    : [pPixY] "+&r"((unsigned char *)pPixY) /* read-write: the asm advances pPixY while walking the rows */
+    : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+      "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
+                                 uint8_t *pSrc) { /*
+   * Reads eight 16-byte vectors from a contiguous buffer and scatters the
+   * rearranged data as 8 bytes into each of 16 consecutive picture rows.
+   *
+   * pPixY   : address of the first destination row; rows 0..15 are written
+   *           at pPixY + k * iStride, 8 bytes each.
+   * iStride : distance in bytes between two consecutive rows.
+   * pSrc    : source buffer; bytes pSrc[0x00..0x7f] are read.
+   *           NOTE(review): the 128-bit gslqc1 loads presumably need pSrc
+   *           to be 16-byte aligned - confirm against the callers.
+   *
+   * BACKUP_REG, RECOVER_REG and MMI_TransTwo8x8B are defined outside this
+   * function. Judging by their names they save/restore registers and
+   * transpose two 8x8 byte tiles - confirm against their definitions.
+   */
+  BACKUP_REG;
+  __asm__ volatile(
+    ".set       arch=loongson3a                           \n\t" /* let the assembler accept Loongson-3A opcodes */
+    "gslqc1     $f2, $f0, 0x0(%[pSrc])                    \n\t" /* eight 16-byte loads pull in pSrc[0x00..0x7f] */
+    "gslqc1     $f6, $f4, 0x10(%[pSrc])                   \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[pSrc])                  \n\t"
+    "gslqc1     $f14, $f12, 0x30(%[pSrc])                 \n\t"
+    "gslqc1     $f18, $f16, 0x40(%[pSrc])                 \n\t"
+    "gslqc1     $f22, $f20, 0x50(%[pSrc])                 \n\t"
+    "gslqc1     $f26, $f24, 0x60(%[pSrc])                 \n\t"
+    "gslqc1     $f30, $f28, 0x70(%[pSrc])                 \n\t"
+
+    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
+                     $f14, $f16, $f18, $f20, $f22, $f24,
+                     $f26, $f28, $f30, $9, $10) /* macro defined elsewhere; $9/$10 are presumably its scratch registers */
+
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 1 (pPixY -> row 0) */
+    "gssdlc1    $f16, 0x7(%[pPixY])                       \n\t" /* each gssdlc1 (offset 7) + gssdrc1 (offset 0) pair is one unaligned 8-byte store */
+    "gssdlc1    $f8, 0x7($8)                              \n\t"
+    "gssdrc1    $f16, 0x0(%[pPixY])                       \n\t" /* row 0 = f16 */
+    "gssdrc1    $f8, 0x0($8)                              \n\t" /* row 1 = f8 */
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 2 */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 3 */
+    "gssdlc1    $f12, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f28, 0x7($8)                             \n\t"
+    "gssdrc1    $f12, 0x0(%[pPixY])                       \n\t" /* row 2 = f12 */
+    "gssdrc1    $f28, 0x0($8)                             \n\t" /* row 3 = f28 */
+
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 4 */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 5 */
+    "gssdlc1    $f20, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f4, 0x7($8)                              \n\t"
+    "gssdrc1    $f20, 0x0(%[pPixY])                       \n\t" /* row 4 = f20 */
+    "gssdrc1    $f4, 0x0($8)                              \n\t" /* row 5 = f4 */
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 6 */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 7 */
+    "gssdlc1    $f24, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f0, 0x7($8)                              \n\t"
+    "gssdrc1    $f24, 0x0(%[pPixY])                       \n\t" /* row 6 = f24 */
+    "gssdrc1    $f0, 0x0($8)                              \n\t" /* row 7 = f0 */
+
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 8; rows 8..15 take the other register of each pair */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 9 */
+    "gssdlc1    $f18, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f10, 0x7($8)                             \n\t"
+    "gssdrc1    $f18, 0x0(%[pPixY])                       \n\t" /* row 8 = f18 */
+    "gssdrc1    $f10, 0x0($8)                             \n\t" /* row 9 = f10 */
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 10 */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 11 */
+    "gssdlc1    $f14, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f30, 0x7($8)                             \n\t"
+    "gssdrc1    $f14, 0x0(%[pPixY])                       \n\t" /* row 10 = f14 */
+    "gssdrc1    $f30, 0x0($8)                             \n\t" /* row 11 = f30 */
+
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 12 */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 13 */
+    "gssdlc1    $f22, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f6, 0x7($8)                              \n\t"
+    "gssdrc1    $f22, 0x0(%[pPixY])                       \n\t" /* row 12 = f22 */
+    "gssdrc1    $f6, 0x0($8)                              \n\t" /* row 13 = f6 */
+    "daddu      %[pPixY], $8, %[iStride]                  \n\t" /* pPixY -> row 14 */
+    "daddu      $8, %[pPixY], %[iStride]                  \n\t" /* $8 -> row 15 */
+    "gssdlc1    $f26, 0x7(%[pPixY])                       \n\t"
+    "gssdlc1    $f2, 0x7($8)                              \n\t"
+    "gssdrc1    $f26, 0x0(%[pPixY])                       \n\t" /* row 14 = f26 */
+    "gssdrc1    $f2, 0x0($8)                              \n\t" /* row 15 = f2 */
+    : [pPixY] "+&r"((unsigned char *)pPixY) /* read-write: the asm advances pPixY while walking the rows */
+    : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
+      "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
+                         int32_t iBeta) {
+  unsigned char tmp[720] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "dsll       $11, %[iStride], 0x2                      \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    "daddu      $14, %[iStride], %[pPix]                  \n\t"
+    "dsubu      $8, %[pPix], $11                          \n\t"
+    "gslqc1     $f14, $f12, 0x0($8)                       \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[pPix])                  \n\t"
+    "daddu      $9, %[iStride], %[iStride]                \n\t"
+    "daddu      $10, $9, %[iStride]                       \n\t"
+    "move       $12, $9                                   \n\t"
+    "dsubu      $8, %[pPix], $9                           \n\t"
+    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
+    "dsubu      $9, %[pPix], %[iStride]                   \n\t"
+    "gslqc1     $f18, $f16, 0x0($9)                       \n\t"
+    "daddu      $13, %[iStride], %[pPix]                  \n\t"
+
+    "move       %[iStride], $12                           \n\t"
+    "daddu      $15, $12, %[pPix]                         \n\t"
+
+    "daddu      $12, %[pPix], $10                         \n\t"
+    "dsubu      $11, %[pPix], $10                         \n\t"
+
+    "gslqc1     $f26, $f24, 0x0($11)                      \n\t"
+    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+
+    "punpcklhw  $f28, $f0, $f0                            \n\t"
+    "punpcklwd  $f0, $f28, $f28                           \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+    "gssqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
+    "dmtc1      %[iBeta], $f0                             \n\t"
+    "gsldxc1    $f10, 0x0($15, $0)                        \n\t"
+    "punpcklhw  $f28, $f0, $f0                            \n\t"
+    "punpcklwd  $f0, $f28, $f28                           \n\t"
+    "punpckhbh  $f30, $f10, $f8                           \n\t"
+    "mov.d      $f2, $f0                                  \n\t"
+
+    "punpcklbh  $f28, $f10, $f8                           \n\t"
+    "gssqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
+    "gssqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "mov.d      $f0, $f4                                  \n\t"
+    "gssqc1     $f22, $f20, 704-272(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
+    "mov.d      $f4, $f16                                 \n\t"
+    "punpckhbh  $f22, $f20, $f8                           \n\t"
+    "punpcklbh  $f20, $f20, $f8                           \n\t"
+    "punpckhbh  $f6, $f4, $f8                             \n\t"
+    "punpcklbh  $f4, $f4, $f8                             \n\t"
+
+    "psubh      $f28, $f20, $f4                           \n\t"
+    "psubh      $f30, $f22, $f6                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
+    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "punpckhbh  $f2, $f0, $f8                             \n\t"
+    "punpcklbh  $f0, $f0, $f8                             \n\t"
+    "gssqc1     $f18, $f16, 688-272(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
+    "gssqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
+
+    "psubh      $f28, $f4, $f0                            \n\t"
+    "psubh      $f30, $f6, $f2                            \n\t"
+
+    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
+    "punpckhbh  $f18, $f16, $f8                           \n\t"
+    "punpcklbh  $f16, $f16, $f8                           \n\t"
+    "pcmpgth    $f0, $f0, $f28                            \n\t"
+    "pcmpgth    $f2, $f2, $f30                            \n\t"
+    "gssqc1     $f18, $f16, 640-384(%[tmp])               \n\t"
+    "psubh      $f28, $f20, $f16                          \n\t"
+    "psubh      $f30, $f22, $f18                          \n\t"
+    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
+    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
+    "punpckhbh  $f26, $f24, $f8                           \n\t"
+    "punpcklbh  $f24, $f24, $f8                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gssqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 640-144(%[tmp])                 \n\t"
+    "gssqc1     $f22, $f20, 640-400(%[tmp])               \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+    "gslqc1     $f18, $f16, 640-320(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "dli        %[iBeta], 0x2                             \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "dmtc1      %[iBeta], $f10                            \n\t"
+    "gssqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
+    "gslqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
+
+    "punpcklhw  $f28, $f16, $f16                          \n\t"
+    "psrah      $f16, $f0, $f10                           \n\t"
+    "psrah      $f18, $f2, $f10                           \n\t"
+    "punpcklwd  $f28, $f28, $f28                          \n\t"
+    "mov.d      $f30, $f28                                \n\t"
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "paddh      $f16, $f16, $f28                          \n\t"
+    "paddh      $f18, $f18, $f30                          \n\t"
+    "gssqc1     $f18, $f16, 640-576(%[tmp])               \n\t"
+    "pcmpgth    $f16, $f16, $f8                           \n\t"
+    "pcmpgth    $f18, $f18, $f10                          \n\t"
+    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
+
+    "gssqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
+    "psubh      $f28, $f4, $f24                           \n\t"
+    "psubh      $f30, $f6, $f26                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+
+    "gslqc1     $f2, $f0, 640-416(%[tmp])                 \n\t"
+    "and        $f16, $f16, $f8                           \n\t"
+    "and        $f18, $f18, $f10                          \n\t"
+    "gssqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
+    "psubh      $f28, $f20, $f0                           \n\t"
+    "psubh      $f30, $f22, $f2                           \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "pcmpgth    $f16, $f16, $f28                          \n\t"
+    "pcmpgth    $f18, $f18, $f30                          \n\t"
+
+    "and        $f16, $f16, $f8                           \n\t"
+    "and        $f18, $f18, $f10                          \n\t"
+    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
+
+    "gslqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    "pandn      $f16, $f16, $f24                          \n\t"
+    "dli        %[iAlpha], 0x4                            \n\t"
+    "pandn      $f18, $f18, $f26                          \n\t"
+    "gssqc1     $f18, $f16, 640-16(%[tmp])                \n\t"
+    "dmtc1      %[iAlpha], $f16                           \n\t"
+    "punpcklhw  $f28, $f16, $f16                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "punpckhbh  $f18, $f12, $f8                           \n\t"
+    "dmtc1      %[iAlpha], $f30                           \n\t"
+    "punpcklbh  $f16, $f12, $f8                           \n\t"
+    "psllh      $f16, $f16, $f30                          \n\t"
+    "psllh      $f18, $f18, $f30                          \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "gslqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "paddh      $f16, $f16, $f24                          \n\t"
+    "paddh      $f18, $f18, $f26                          \n\t"
+    "paddh      $f16, $f16, $f0                           \n\t"
+    "paddh      $f18, $f18, $f2                           \n\t"
+
+    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "punpcklwd  $f28, $f28, $f28                          \n\t"
+    "mov.d      $f30, $f28                                \n\t"
+    "paddh      $f16, $f16, $f4                           \n\t"
+    "paddh      $f18, $f18, $f6                           \n\t"
+    "gssqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f16, $f16, $f20                          \n\t"
+    "paddh      $f18, $f18, $f22                          \n\t"
+    "paddh      $f16, $f16, $f28                          \n\t"
+    "paddh      $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-384(%[tmp])                 \n\t"
+    "pandn      $f24, $f24, $f28                          \n\t"
+    "pandn      $f26, $f26, $f30                          \n\t"
+    "gssqc1     $f26, $f24, 640-80(%[tmp])                \n\t"
+    "gslqc1     $f26, $f24, 0x0($12)                      \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "punpckhbh  $f26, $f24, $f8                           \n\t"
+    "punpcklbh  $f24, $f24, $f8                           \n\t"
+    "psllh      $f24, $f24, $f10                          \n\t"
+    "psllh      $f26, $f26, $f10                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "psrah      $f24, $f24, $f10                          \n\t"
+    "psrah      $f26, $f26, $f10                          \n\t"
+    "and        $f24, $f24, $f0                           \n\t"
+    "and        $f26, $f26, $f2                           \n\t"
+    "gssqc1     $f26, $f24, 640-112(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "pandn      $f24, $f24, $f28                          \n\t"
+    "pandn      $f26, $f26, $f30                          \n\t"
+    "gssqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gssqc1     $f26, $f24, 640-528(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-544(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "psrah      $f16, $f16, $f10                          \n\t"
+    "psrah      $f18, $f18, $f10                          \n\t"
+    "and        $f16, $f16, $f0                           \n\t"
+    "and        $f18, $f18, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 640-624(%[tmp])                 \n\t"
+    "paddh      $f28, $f4, $f20                           \n\t"
+    "paddh      $f30, $f6, $f22                           \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "gslqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "paddh      $f20, $f20, $f4                           \n\t"
+    "paddh      $f22, $f22, $f6                           \n\t"
+    "psrah      $f24, $f24, $f10                          \n\t"
+    "psrah      $f26, $f26, $f10                          \n\t"
+    "and        $f28, $f28, $f24                          \n\t"
+    "and        $f30, $f30, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 640-384(%[tmp])               \n\t"
+    "gssqc1     $f30, $f28, 640-64(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "pandn      $f28, $f28, $f24                          \n\t"
+    "pandn      $f30, $f30, $f26                          \n\t"
+    "gssqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
+    "paddh      $f28, $f28, $f24                          \n\t"
+    "paddh      $f30, $f30, $f26                          \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "paddh      $f28, $f28, $f8                           \n\t"
+    "paddh      $f30, $f30, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "gslqc1     $f22, $f20, 640-560(%[tmp])               \n\t"
+    "psrah      $f28, $f28, $f10                          \n\t"
+    "psrah      $f30, $f30, $f10                          \n\t"
+    "and        $f20, $f20, $f28                          \n\t"
+    "and        $f22, $f22, $f30                          \n\t"
+    "gssqc1     $f22, $f20, 640-32(%[tmp])                \n\t"
+
+    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
+    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
+    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
+    "paddh      $f28, $f20, $f20                          \n\t"
+    "paddh      $f30, $f22, $f22                          \n\t"
+    "paddh      $f20, $f4, $f24                           \n\t"
+    "paddh      $f22, $f6, $f26                           \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "paddh      $f28, $f28, $f8                           \n\t"
+    "paddh      $f30, $f30, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "gslqc1     $f22, $f20, 640-544(%[tmp])               \n\t"
+    "psrah      $f28, $f28, $f10                          \n\t"
+    "psrah      $f30, $f30, $f10                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "pandn      $f20, $f20, $f28                          \n\t"
+    "pandn      $f22, $f22, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
+    "paddh      $f28, $f28, $f4                           \n\t"
+    "paddh      $f30, $f30, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 640-400(%[tmp])                 \n\t"
+    "paddh      $f28, $f28, $f4                           \n\t"
+    "paddh      $f30, $f30, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 640-544(%[tmp])                 \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "gssqc1     $f22, $f20, 640-352(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 640-368(%[tmp])               \n\t"
+    "psllh      $f28, $f28, $f10                          \n\t"
+    "psllh      $f30, $f30, $f10                          \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "paddh      $f28, $f28, $f24                          \n\t"
+    "paddh      $f30, $f30, $f26                          \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
+    "psrah      $f20, $f20, $f10                          \n\t"
+    "psrah      $f22, $f22, $f10                          \n\t"
+    "and        $f4, $f4, $f20                            \n\t"
+    "and        $f6, $f6, $f22                            \n\t"
+    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 640-96(%[tmp])                  \n\t"
+    "gslqc1     $f6, $f4, 640-384(%[tmp])                 \n\t"
+    "gslqc1     $f10, $f8, 640-400(%[tmp])                \n\t"
+    "paddh      $f24, $f4, $f4                            \n\t"
+    "paddh      $f26, $f6, $f6                            \n\t"
+    "paddh      $f4, $f4, $f8                             \n\t"
+    "paddh      $f6, $f6, $f10                            \n\t"
+    "gslqc1     $f10, $f8, 640-144(%[tmp])                \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "paddh      $f4, $f4, $f8                             \n\t"
+    "paddh      $f6, $f6, $f10                            \n\t"
+    "gslqc1     $f10, $f8, 640-592(%[tmp])                \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "psrah      $f24, $f24, $f8                           \n\t"
+    "psrah      $f26, $f26, $f8                           \n\t"
+    "psllh      $f4, $f4, $f10                            \n\t"
+    "psllh      $f6, $f6, $f10                            \n\t"
+    "paddh      $f4, $f4, $f20                            \n\t"
+    "paddh      $f6, $f6, $f22                            \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+
+    "gslqc1     $f22, $f20, 656-272(%[tmp])               \n\t"
+    "pandn      $f28, $f28, $f24                          \n\t"
+    "pandn      $f30, $f30, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 640-416(%[tmp])               \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 640-560(%[tmp])                 \n\t"
+    "psrah      $f24, $f24, $f10                          \n\t"
+    "psrah      $f26, $f26, $f10                          \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+
+    "xor        $f8, $f8, $f8                             \n\t"
+    "gslqc1     $f26, $f24, 704-272(%[tmp])               \n\t"
+    "gssqc1     $f6, $f4, 640-128(%[tmp])                 \n\t"
+    "gslqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
+    "punpcklbh  $f4, $f6, $f8                             \n\t"
+    "punpckhbh  $f6, $f6, $f8                             \n\t"
+    "gssqc1     $f6, $f4, 640-448(%[tmp])                 \n\t"
+    "gslqc1     $f6, $f4, 688-272(%[tmp])                 \n\t"
+    "punpcklbh  $f4, $f6, $f8                             \n\t"
+    "punpckhbh  $f6, $f6, $f8                             \n\t"
+    "punpcklbh  $f24, $f26, $f8                           \n\t"
+    "punpckhbh  $f26, $f26, $f8                           \n\t"
+    "gssqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
+    "punpcklbh  $f20, $f22, $f8                           \n\t"
+    "punpckhbh  $f22, $f22, $f8                           \n\t"
+    "gslqc1     $f30, $f28, 0x0($14)                      \n\t"
+    "gssqc1     $f6, $f4, 640-496(%[tmp])                 \n\t"
+    "gssqc1     $f26, $f24, 640-432(%[tmp])               \n\t"
+
+    "gsldxc1    $f0, 0x8($15, $0)                         \n\t"
+    "punpcklbh  $f28, $f30, $f8                           \n\t"
+    "punpckhbh  $f30, $f30, $f8                           \n\t"
+    "gssqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
+
+    "punpcklbh  $f28, $f0, $f8                            \n\t"
+    "punpckhbh  $f30, $f0, $f8                            \n\t"
+    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
+    "gssqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
+
+    "psubh      $f28, $f24, $f4                           \n\t"
+    "psubh      $f30, $f26, $f6                           \n\t"
+    "psubh      $f24, $f24, $f8                           \n\t"
+    "psubh      $f26, $f26, $f10                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "gslqc1     $f10, $f8, 640-16(%[tmp])                 \n\t"
+    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "or         $f16, $f16, $f8                           \n\t"
+    "or         $f18, $f18, $f10                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
+    "psubh      $f28, $f4, $f28                           \n\t"
+    "psubh      $f30, $f6, $f30                           \n\t"
+
+    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "pcmpgth    $f4, $f0, $f28                            \n\t"
+    "pcmpgth    $f6, $f2, $f30                            \n\t"
+    "pcmpgth    $f28, $f0, $f24                           \n\t"
+    "pcmpgth    $f30, $f2, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-320(%[tmp])               \n\t"
+    "and        $f4, $f4, $f28                            \n\t"
+    "and        $f6, $f6, $f30                            \n\t"
+    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+
+    "gslqc1     $f26, $f24, 640-576(%[tmp])               \n\t"
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "punpcklbh  $f12, $f14, $f8                           \n\t"
+    "punpckhbh  $f14, $f14, $f8                           \n\t"
+    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
+    "and        $f24, $f24, $f8                           \n\t"
+    "and        $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
+    "psubh      $f28, $f28, $f8                           \n\t"
+    "psubh      $f30, $f30, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f10                           \n\t"
+
+    "psllh      $f12, $f12, $f10                          \n\t"
+    "psllh      $f14, $f14, $f10                          \n\t"
+    "gssqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
+
+    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
+    "paddh      $f12, $f12, $f20                          \n\t"
+    "paddh      $f14, $f14, $f22                          \n\t"
+    "paddh      $f12, $f12, $f20                          \n\t"
+    "paddh      $f14, $f14, $f22                          \n\t"
+    "paddh      $f12, $f12, $f20                          \n\t"
+    "paddh      $f14, $f14, $f22                          \n\t"
+    "paddh      $f12, $f12, $f8                           \n\t"
+    "paddh      $f14, $f14, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
+    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
+    "paddh      $f12, $f12, $f8                           \n\t"
+    "paddh      $f14, $f14, $f10                          \n\t"
+    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
+    "pcmpgth    $f24, $f24, $f28                          \n\t"
+    "pcmpgth    $f26, $f26, $f30                          \n\t"
+    "and        $f24, $f24, $f0                           \n\t"
+    "and        $f26, $f26, $f2                           \n\t"
+    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
+
+    "gslqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "gslqc1     $f30, $f28, 640-368(%[tmp])               \n\t"
+    "and        $f24, $f0, $f16                           \n\t"
+    "and        $f26, $f2, $f18                           \n\t"
+    "pandn      $f16, $f0, $f28                           \n\t"
+    "pandn      $f18, $f2, $f30                           \n\t"
+    "or         $f24, $f24, $f16                          \n\t"
+    "or         $f26, $f26, $f18                          \n\t"
+    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f16                          \n\t"
+    "paddh      $f14, $f14, $f18                          \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f12, $f12, $f28                          \n\t"
+    "psrah      $f14, $f14, $f28                          \n\t"
+    "and        $f12, $f12, $f8                           \n\t"
+    "and        $f14, $f14, $f10                          \n\t"
+    "pandn      $f8, $f8, $f20                            \n\t"
+    "pandn      $f10, $f10, $f22                          \n\t"
+    "or         $f12, $f12, $f8                           \n\t"
+    "or         $f14, $f14, $f10                          \n\t"
+    "and        $f28, $f4, $f12                           \n\t"
+    "and        $f30, $f6, $f14                           \n\t"
+    "gslqc1     $f14, $f12, 640-64(%[tmp])                \n\t"
+    "gslqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
+    "or         $f12, $f12, $f8                           \n\t"
+    "or         $f14, $f14, $f10                          \n\t"
+    "pandn      $f8, $f4, $f20                            \n\t"
+    "pandn      $f10, $f6, $f22                           \n\t"
+    "or         $f28, $f28, $f8                           \n\t"
+    "or         $f30, $f30, $f10                          \n\t"
+
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "and        $f8, $f0, $f12                            \n\t"
+    "and        $f10, $f2, $f14                           \n\t"
+    "gslqc1     $f14, $f12, 640-480(%[tmp])               \n\t"
+    "pandn      $f12, $f0, $f12                           \n\t"
+    "pandn      $f14, $f2, $f14                           \n\t"
+    "or         $f8, $f8, $f12                            \n\t"
+    "or         $f10, $f10, $f14                          \n\t"
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f28, $f30                          \n\t"
+    "gssqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
+    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
+    "paddh      $f8, $f20, $f8                            \n\t"
+    "paddh      $f10, $f22, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "paddh      $f28, $f28, $f16                          \n\t"
+    "paddh      $f30, $f30, $f18                          \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f8, $f8, $f28                            \n\t"
+    "psrah      $f10, $f10, $f28                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+    "gslqc1     $f30, $f28, 640-544(%[tmp])               \n\t"
+    "and        $f24, $f24, $f8                           \n\t"
+    "and        $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
+    "pandn      $f28, $f28, $f8                           \n\t"
+    "pandn      $f30, $f30, $f10                          \n\t"
+    "or         $f24, $f24, $f28                          \n\t"
+    "or         $f26, $f26, $f30                          \n\t"
+    "and        $f12, $f4, $f24                           \n\t"
+    "and        $f14, $f6, $f26                           \n\t"
+    "pandn      $f24, $f4, $f8                            \n\t"
+    "pandn      $f26, $f6, $f10                           \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "or         $f12, $f12, $f24                          \n\t"
+    "or         $f14, $f14, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f12, $f14                          \n\t"
+    "psllh      $f8, $f8, $f28                            \n\t"
+    "psllh      $f10, $f10, $f28                          \n\t"
+    "gssqc1     $f26, $f24, 672-272(%[tmp])               \n\t"
+    "gslqc1     $f26, $f24, 640-96(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-352(%[tmp])               \n\t"
+    "or         $f24, $f24, $f28                          \n\t"
+    "or         $f26, $f26, $f30                          \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+
+    "and        $f12, $f0, $f24                           \n\t"
+    "and        $f14, $f2, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-144(%[tmp])               \n\t"
+    "pandn      $f24, $f0, $f24                           \n\t"
+    "pandn      $f26, $f2, $f26                           \n\t"
+    "or         $f12, $f12, $f24                          \n\t"
+    "or         $f14, $f14, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
+    "gssqc1     $f14, $f12, 640-352(%[tmp])               \n\t"
+    "gslqc1     $f14, $f12, 640-464(%[tmp])               \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "paddh      $f8, $f8, $f12                            \n\t"
+    "paddh      $f10, $f10, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 640-448(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "and        $f24, $f24, $f20                          \n\t"
+    "and        $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f20                            \n\t"
+    "paddh      $f10, $f10, $f22                          \n\t"
+    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "paddh      $f16, $f12, $f12                          \n\t"
+    "paddh      $f18, $f14, $f14                          \n\t"
+    "paddh      $f16, $f16, $f8                           \n\t"
+    "paddh      $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f16, $f16, $f28                          \n\t"
+    "paddh      $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f16, $f16, $f28                          \n\t"
+    "psrah      $f18, $f18, $f28                          \n\t"
+    "pandn      $f8, $f8, $f16                            \n\t"
+    "pandn      $f10, $f10, $f18                          \n\t"
+    "or         $f24, $f24, $f8                           \n\t"
+    "or         $f26, $f26, $f10                          \n\t"
+    "and        $f28, $f4, $f24                           \n\t"
+    "and        $f30, $f6, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-496(%[tmp])               \n\t"
+    "pandn      $f8, $f4, $f24                            \n\t"
+    "pandn      $f10, $f6, $f26                           \n\t"
+    "or         $f28, $f28, $f8                           \n\t"
+    "or         $f30, $f30, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-352(%[tmp])                \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f28, $f30                          \n\t"
+    "gssqc1     $f10, $f8, 688-272(%[tmp])                \n\t"
+    "gslqc1     $f10, $f8, 640-128(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
+    "or         $f8, $f8, $f28                            \n\t"
+    "or         $f10, $f10, $f30                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+
+    "and        $f16, $f0, $f8                            \n\t"
+    "and        $f18, $f2, $f10                           \n\t"
+    "paddh      $f20, $f20, $f24                          \n\t"
+    "paddh      $f22, $f22, $f26                          \n\t"
+    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
+    "pandn      $f8, $f0, $f28                            \n\t"
+    "pandn      $f10, $f2, $f30                           \n\t"
+    "or         $f16, $f16, $f8                           \n\t"
+    "or         $f18, $f18, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "psllh      $f20, $f20, $f28                          \n\t"
+    "psllh      $f22, $f22, $f28                          \n\t"
+    "paddh      $f20, $f20, $f12                          \n\t"
+    "paddh      $f22, $f22, $f14                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f20                            \n\t"
+    "paddh      $f10, $f10, $f22                          \n\t"
+    "psrah      $f8, $f8, $f28                            \n\t"
+    "psrah      $f10, $f10, $f28                          \n\t"
+    "gssqc1     $f18, $f16, 640-288(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
+    "and        $f16, $f16, $f8                           \n\t"
+    "and        $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
+    "paddh      $f20, $f8, $f8                            \n\t"
+    "paddh      $f22, $f10, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-432(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
+    "paddh      $f8, $f8, $f28                            \n\t"
+    "paddh      $f10, $f10, $f30                          \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f20, $f20, $f28                          \n\t"
+    "paddh      $f22, $f22, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
+    "psrah      $f20, $f20, $f28                          \n\t"
+    "psrah      $f22, $f22, $f28                          \n\t"
+    "pandn      $f12, $f12, $f20                          \n\t"
+    "pandn      $f14, $f14, $f22                          \n\t"
+    "or         $f16, $f16, $f12                          \n\t"
+    "or         $f18, $f18, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 640-32(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
+    "or         $f12, $f12, $f28                          \n\t"
+    "or         $f14, $f14, $f30                          \n\t"
+    "and        $f28, $f4, $f16                           \n\t"
+    "and        $f30, $f6, $f18                           \n\t"
+    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
+    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
+    "pandn      $f8, $f4, $f16                            \n\t"
+    "pandn      $f10, $f6, $f18                           \n\t"
+    "or         $f28, $f28, $f8                           \n\t"
+    "or         $f30, $f30, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
+    "paddh      $f16, $f16, $f8                           \n\t"
+    "paddh      $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 640-288(%[tmp])                \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f28, $f30                          \n\t"
+    "dli        %[iAlpha], 0x2                            \n\t"
+    "gssqc1     $f10, $f8, 704-272(%[tmp])                \n\t"
+
+    "and        $f8, $f0, $f12                            \n\t"
+    "and        $f10, $f2, $f14                           \n\t"
+    "gslqc1     $f30, $f28, 640-384(%[tmp])               \n\t"
+    "pandn      $f12, $f0, $f28                           \n\t"
+    "pandn      $f14, $f2, $f30                           \n\t"
+    "or         $f8, $f8, $f12                            \n\t"
+    "or         $f10, $f10, $f14                          \n\t"
+    "gssqc1     $f10, $f8, 640-304(%[tmp])                \n\t"
+    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
+    "gslqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
+    "paddh      $f12, $f8, $f28                           \n\t"
+    "paddh      $f14, $f10, $f30                          \n\t"
+    "paddh      $f12, $f12, $f16                          \n\t"
+    "paddh      $f14, $f14, $f18                          \n\t"
+    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
+    "paddh      $f12, $f12, $f28                          \n\t"
+    "paddh      $f14, $f14, $f30                          \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f12, $f12, $f28                          \n\t"
+    "psrah      $f14, $f14, $f28                          \n\t"
+    "and        $f24, $f24, $f12                          \n\t"
+    "and        $f26, $f26, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
+    "pandn      $f16, $f12, $f20                          \n\t"
+    "pandn      $f18, $f14, $f22                          \n\t"
+    "or         $f24, $f24, $f16                          \n\t"
+    "or         $f26, $f26, $f18                          \n\t"
+    "and        $f28, $f4, $f24                           \n\t"
+    "and        $f30, $f6, $f26                           \n\t"
+    "gslqc1     $f26, $f24, 640-304(%[tmp])               \n\t"
+    "pandn      $f16, $f4, $f20                           \n\t"
+    "pandn      $f18, $f6, $f22                           \n\t"
+    "or         $f28, $f28, $f16                          \n\t"
+    "or         $f30, $f30, $f18                          \n\t"
+    "dli        %[iAlpha], 0x1                            \n\t"
+
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f28, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 640-112(%[tmp])               \n\t"
+    "gslqc1     $f18, $f16, 640-80(%[tmp])                \n\t"
+    "or         $f28, $f28, $f16                          \n\t"
+    "or         $f30, $f30, $f18                          \n\t"
+    "and        $f16, $f0, $f28                           \n\t"
+    "and        $f18, $f2, $f30                           \n\t"
+    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
+    "pandn      $f0, $f0, $f28                            \n\t"
+    "pandn      $f2, $f2, $f30                            \n\t"
+    "or         $f16, $f16, $f0                           \n\t"
+    "or         $f18, $f18, $f2                           \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gslqc1     $f2, $f0, 0x0($12)                        \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "punpcklbh  $f0, $f2, $f30                            \n\t"
+    "punpckhbh  $f2, $f2, $f30                            \n\t"
+    "psllh      $f0, $f0, $f28                            \n\t"
+    "psllh      $f2, $f2, $f28                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+    "paddh      $f2, $f2, $f10                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+    "paddh      $f2, $f2, $f10                            \n\t"
+    "paddh      $f0, $f0, $f8                             \n\t"
+    "paddh      $f2, $f2, $f10                            \n\t"
+    "paddh      $f0, $f0, $f20                            \n\t"
+    "paddh      $f2, $f2, $f22                            \n\t"
+    "dli        %[iAlpha], 0x3                            \n\t"
+    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "paddh      $f2, $f2, $f30                            \n\t"
+    "dmtc1      %[iAlpha], $f28                           \n\t"
+    "psrah      $f0, $f0, $f28                            \n\t"
+    "psrah      $f2, $f2, $f28                            \n\t"
+    "and        $f0, $f0, $f12                            \n\t"
+    "and        $f2, $f2, $f14                            \n\t"
+    "pandn      $f12, $f12, $f8                           \n\t"
+    "pandn      $f14, $f14, $f10                          \n\t"
+    "or         $f0, $f0, $f12                            \n\t"
+    "or         $f2, $f2, $f14                            \n\t"
+    "and        $f28, $f4, $f0                            \n\t"
+    "and        $f30, $f6, $f2                            \n\t"
+
+    "gslqc1     $f2, $f0, 656-272(%[tmp])                 \n\t"
+    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
+
+    "gslqc1     $f2, $f0, 672-272(%[tmp])                 \n\t"
+
+    "gssqc1     $f2, $f0, 0x0($8)                         \n\t"
+    "gslqc1     $f2, $f0, 688-272(%[tmp])                 \n\t"
+    "gssqc1     $f2, $f0, 0x0($9)                         \n\t"
+    "gslqc1     $f2, $f0, 704-272(%[tmp])                 \n\t"
+
+    "pandn      $f4, $f4, $f8                             \n\t"
+    "pandn      $f6, $f6, $f10                            \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
+    "or         $f28, $f28, $f4                           \n\t"
+    "or         $f30, $f30, $f6                           \n\t"
+    "packushb   $f16, $f16, $f18                          \n\t"
+    "packushb   $f18, $f28, $f30                          \n\t"
+    "gssqc1     $f26, $f24, 0x0($13)                      \n\t"
+    "gssqc1     $f18, $f16, 0x0(%[iStride])               \n\t"
+    : [pPix]"+&r"((unsigned char *)pPix)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
+      "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+  unsigned char tmp[256] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "lb         $8, 0x2(%[pTC])                           \n\t"
+    "lb         $9, 0x3(%[pTC])                           \n\t"
+    "move       $11, $8                                   \n\t"
+    "lb         $8, 0x1(%[pTC])                           \n\t"
+    "lb         %[pTC], 0x0(%[pTC])                       \n\t"
+    "move       $12, %[pTC]                               \n\t"
+    "and        %[pTC], $9, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f4                               \n\t"
+    "and        %[pTC], $9, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f8                               \n\t"
+    "move       %[pTC], $11                               \n\t"
+    "and        $9, %[pTC], 0xFFFF                        \n\t"
+    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
+    "dmtc1      %[pTC], $f16                              \n\t"
+    "and        %[pTC], $8, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f20                              \n\t"
+    "dmtc1      $9, $f12                                  \n\t"
+    "and        %[pTC], $8, 0xFFFF                        \n\t"
+    "dmtc1      %[pTC], $f24                              \n\t"
+    "move       %[pTC], $12                               \n\t"
+    "and        $9, %[pTC], 0xFFFF                        \n\t"
+    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "xor        $f2, $f2, $f2                             \n\t"
+    "gssqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
+    "dmtc1      $9, $f28                                  \n\t"
+    "dmtc1      %[pTC], $f0                               \n\t"
+    "daddu      %[pTC], %[iStride], %[iStride]            \n\t"
+    "dsubu      $9, %[pPixCb], %[pTC]                     \n\t"
+    "punpcklhw  $f20, $f20, $f4                           \n\t"
+    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCr])          \n\t"
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
+    "punpcklhw  $f0, $f0, $f24                            \n\t"
+    "gsldxc1    $f24, 0x0($9, $0)                         \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "punpckhhw  $f2, $f0, $f28                            \n\t"
+    "punpcklhw  $f0, $f0, $f28                            \n\t"
+    "dsubu      $9, %[pPixCr], %[pTC]                     \n\t"
+    "psubh      $f8, $f4, $f0                             \n\t"
+    "psubh      $f10, $f6, $f2                            \n\t"
+    "gssqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
+    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
+    "mov.d      $f26, $f8                                 \n\t"
+    "dsubu      %[pTC], %[pPixCb], %[iStride]             \n\t"
+    "gsldxc1    $f28, 0x0(%[pTC], $0)                     \n\t"
+    "dsubu      $9, %[pPixCr], %[iStride]                 \n\t"
+    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
+    "mov.d      $f30, $f8                                 \n\t"
+    "gsldxc1    $f8, 0x0(%[pPixCr], $0)                   \n\t"
+    "mov.d      $f14, $f8                                 \n\t"
+    "gsldxc1    $f8, 0x0(%[iStride], %[pPixCb])           \n\t"
+    "mov.d      $f10, $f16                                \n\t"
+    "gssqc1     $f10, $f8, 0xE0(%[tmp])                   \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "punpcklhw  $f16, $f8, $f8                            \n\t"
+    "dmtc1      %[iBeta], $f8                             \n\t"
+    "punpcklhw  $f20, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f20, $f20                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
+    "punpckhbh  $f10, $f24, $f4                           \n\t"
+    "punpcklbh  $f8, $f24, $f4                            \n\t"
+    "gssqc1     $f14, $f12, 0xd0(%[tmp])                  \n\t"
+    "punpcklwd  $f16, $f16, $f16                          \n\t"
+    "mov.d      $f18, $f16                                \n\t"
+    "gssqc1     $f10, $f8, 0x30(%[tmp])                   \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xd0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xe0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0xe0(%[tmp])                  \n\t"
+    "mov.d      $f8, $f28                                 \n\t"
+    "mov.d      $f10, $f30                                \n\t"
+    "punpcklbh  $f28, $f30, $f6                           \n\t"
+    "punpckhbh  $f30, $f30, $f6                           \n\t"
+    "punpckhbh  $f22, $f20, $f4                           \n\t"
+    "punpcklbh  $f20, $f20, $f4                           \n\t"
+    "gssqc1     $f30, $f28, 0xa0(%[tmp])                  \n\t"
+    "punpckhbh  $f14, $f12, $f4                           \n\t"
+    "punpcklbh  $f12, $f12, $f4                           \n\t"
+    "dli        %[iBeta], 0x4                             \n\t"
+    "punpckhbh  $f10, $f8, $f4                            \n\t"
+    "punpcklbh  $f8, $f8, $f4                             \n\t"
+    "dmtc1      %[iBeta], $f24                            \n\t"
+    "punpcklhw  $f28, $f24, $f24                          \n\t"
+    "punpcklwd  $f24, $f28, $f28                          \n\t"
+    "mov.d      $f26, $f24                                \n\t"
+    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
+    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    "pcmpgth    $f24, $f0, $f4                            \n\t"
+    "pcmpgth    $f26, $f2, $f6                            \n\t"
+    "gslqc1     $f6, $f4, 0x60(%[tmp])                    \n\t"
+    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
+    "psubh      $f24, $f12, $f8                           \n\t"
+    "psubh      $f26, $f14, $f10                          \n\t"
+    "dmfc1      %[iAlpha], $f12                           \n\t"
+    "dmfc1      %[iBeta], $f14                            \n\t"
+    "dli        $10, 0x2                                  \n\t"
+    "dmtc1      $10, $f12                                 \n\t"
+    "dli        $10, 0x3                                  \n\t"
+    "dmtc1      $10, $f14                                 \n\t"
+    "psllh      $f24, $f24, $f12                          \n\t"
+    "psllh      $f26, $f26, $f12                          \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x20(%[tmp])                  \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psrah      $f24, $f24, $f14                          \n\t"
+    "psrah      $f26, $f26, $f14                          \n\t"
+    "dmtc1      %[iAlpha], $f12                           \n\t"
+    "dmtc1      %[iBeta], $f14                            \n\t"
+    "pmaxsh     $f4, $f4, $f24                            \n\t"
+    "pmaxsh     $f6, $f6, $f26                            \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[tmp])                    \n\t"
+    "gslqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
+    "pminsh     $f24, $f24, $f4                           \n\t"
+    "pminsh     $f26, $f26, $f6                           \n\t"
+    "gssqc1     $f26, $f24, 0x10(%[tmp])                  \n\t"
+    "psubh      $f4, $f8, $f12                            \n\t"
+    "psubh      $f6, $f10, $f14                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26)
+    "pcmpgth    $f24, $f16, $f4                           \n\t"
+    "pcmpgth    $f26, $f18, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
+    "psubh      $f4, $f4, $f8                             \n\t"
+    "psubh      $f6, $f6, $f10                            \n\t"
+    "dmfc1      %[iAlpha], $f8                            \n\t"
+    "dmfc1      %[iBeta], $f10                            \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
+    "pcmpgth    $f28, $f28, $f4                           \n\t"
+    "pcmpgth    $f30, $f30, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
+    "and        $f24, $f24, $f28                          \n\t"
+    "and        $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f20, $f20, $f12                          \n\t"
+    "psubh      $f22, $f22, $f14                          \n\t"
+    WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
+    "pcmpgth    $f4, $f4, $f20                            \n\t"
+    "pcmpgth    $f6, $f6, $f22                            \n\t"
+    "gslqc1     $f22, $f20, 0x80(%[tmp])                  \n\t"
+    "gslqc1     $f10, $f8, 0x90(%[tmp])                   \n\t"
+    "psubh      $f20, $f20, $f8                           \n\t"
+    "psubh      $f22, $f22, $f10                          \n\t"
+    "and        $f24, $f24, $f4                           \n\t"
+    "and        $f26, $f26, $f6                           \n\t"
+    "gslqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
+    "and        $f24, $f24, $f8                           \n\t"
+    "and        $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f6, $f4, 0x10(%[tmp])                    \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "gssqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
+    "gslqc1     $f6, $f4, 0xa0(%[tmp])                    \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    "dli        $10, 0x2                                  \n\t"
+    "dmtc1      $10, $f8                                  \n\t"
+    "psllh      $f24, $f24, $f8                           \n\t"
+    "psllh      $f26, $f26, $f8                           \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "dli        $10, 0x3                                  \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    "dmtc1      $10, $f8                                  \n\t"
+    "gslqc1     $f22, $f20, 0x60(%[tmp])                  \n\t"
+    "psrah      $f24, $f24, $f8                           \n\t"
+    "psrah      $f26, $f26, $f8                           \n\t"
+    "pmaxsh     $f20, $f20, $f24                          \n\t"
+    "pmaxsh     $f22, $f22, $f26                          \n\t"
+    "pminsh     $f0, $f0, $f20                            \n\t"
+    "pminsh     $f2, $f2, $f22                            \n\t"
+    "gslqc1     $f22, $f20, 0x70(%[tmp])                  \n\t"
+    "psubh      $f24, $f4, $f20                           \n\t"
+    "psubh      $f26, $f6, $f22                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+    "pcmpgth    $f16, $f16, $f24                          \n\t"
+    "pcmpgth    $f18, $f18, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "dmtc1      %[iBeta], $f10                            \n\t"
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
+    "and        $f16, $f16, $f24                          \n\t"
+    "and        $f18, $f18, $f26                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+    "gslqc1     $f18, $f16, 0x30(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "paddh      $f4, $f4, $f0                             \n\t"
+    "paddh      $f6, $f6, $f2                             \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f4, $f6                            \n\t"
+    "gssdxc1    $f8, 0x0(%[pTC], $0)                      \n\t"
+    "psubh      $f12, $f12, $f16                          \n\t"
+    "psubh      $f14, $f14, $f18                          \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "packushb   $f12, $f12, $f14                          \n\t"
+    "packushb   $f14, $f20, $f22                          \n\t"
+    "gssdxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t"
+    "gssdxc1    $f10, 0x0($9, $0)                         \n\t"
+    "gssdxc1    $f14, 0x0(%[pPixCr], $0)                  \n\t"
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
+      [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12",  "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+/*
+ * Filters one 8-pixel chroma edge for both planes at once using Loongson MMI.
+ *
+ * pPixCb / pPixCr point at the q0 row of the Cb / Cr plane; the rows at
+ * -2*iStride, -iStride, 0 and +iStride are read as p1, p0, q0 and q1
+ * (8 bytes each).  Per 16-bit lane the code computes
+ *
+ *     mask = (iAlpha > |p0 - q0|) & (iBeta > |p1 - p0|) & (iBeta > |q1 - q0|)
+ *     p0'  = mask ? (2*p1 + p0 + q1 + 2) >> 2 : p0
+ *     q0'  = mask ? (2*q1 + q0 + p1 + 2) >> 2 : q0
+ *
+ * and writes p0' / q0' back to rows -iStride / 0 of each plane.
+ * NOTE(review): judging by the name this is presumably the strong (bS == 4)
+ * chroma filter for a horizontal edge -- confirm against the C reference.
+ *
+ * Only the low 16 bits of iAlpha and iBeta are broadcast into the compare
+ * vectors.  Both values are consumed early; their registers are then reused
+ * by the asm as scratch, which is why they are declared as read-write
+ * operands below.
+ */
+void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta) {
+  /* Scratch area for the widened (16-bit) pixel rows; offsets 0x10..0x6f are used. */
+  unsigned char tmp[128] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                          \n\t"
+    /* Load p1/p0/q0/q1 of both planes: ($f4,$f6)=p1, ($f8,$f10)=p0,
+     * ($f12,$f14)=q0, ($f16,$f18)=q1, each pair being (Cb, Cr).
+     * On exit from this group $8 = pPixCb - iStride, $9 = pPixCr - iStride. */
+    "daddu      $8, %[iStride], %[iStride]               \n\t"
+    "dsubu      $9, %[pPixCb], $8                        \n\t"
+    "gsldxc1    $f16, 0x0(%[pPixCr], $0)                 \n\t"
+    "gsldxc1    $f20, 0x0(%[iStride], %[pPixCr])         \n\t"
+    "gsldxc1    $f4, 0x0($9, $0)                         \n\t"
+    "dsubu      $9, %[pPixCr], $8                        \n\t"
+    "gsldxc1    $f8, 0x0($9, $0)                         \n\t"
+    "mov.d      $f6, $f8                                 \n\t"
+    "dsubu      $8, %[pPixCb], %[iStride]                \n\t"
+    "gsldxc1    $f8, 0x0($8, $0)                         \n\t"
+    "dsubu      $9, %[pPixCr], %[iStride]                \n\t"
+    "gsldxc1    $f12, 0x0($9, $0)                        \n\t"
+    "mov.d      $f10, $f12                               \n\t"
+    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                 \n\t"
+    "mov.d      $f14, $f16                               \n\t"
+    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCb])         \n\t"
+    "mov.d      $f18, $f20                               \n\t"
+    /* Broadcast iAlpha into ($f20,$f22) and iBeta into ($f24,$f26) as
+     * 16-bit lanes; ($f0,$f2) is the zero vector used for byte widening. */
+    "dmtc1      %[iAlpha], $f20                          \n\t"
+    "xor        $f0, $f0, $f0                            \n\t"
+    "xor        $f2, $f2, $f2                            \n\t"
+    "punpcklhw  $f24, $f20, $f20                         \n\t"
+    "punpcklwd  $f20, $f24, $f24                         \n\t"
+    "mov.d      $f22, $f20                               \n\t"
+    "dmtc1      %[iBeta], $f24                           \n\t"
+    "punpcklhw  $f28, $f24, $f24                         \n\t"
+    "punpcklwd  $f24, $f28, $f28                         \n\t"
+    "mov.d      $f26, $f24                               \n\t"
+    /* Widen every row from bytes to halfwords and park the copies in tmp:
+     * 0x10=p0 Cb, 0x20=q1 Cr, 0x30=q0 Cr, 0x40=p1 Cr, 0x50=q0 Cb, 0x60=p1 Cb.
+     * p0 Cr stays in ($f8,$f10) and q1 Cb in ($f28,$f30). */
+    "mov.d      $f28, $f4                                \n\t"
+    "punpcklbh  $f4, $f6, $f2                            \n\t"
+    "punpckhbh  $f6, $f6, $f2                            \n\t"
+    "punpckhbh  $f30, $f28, $f0                          \n\t"
+    "punpcklbh  $f28, $f28, $f0                          \n\t"
+    "gssqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
+    "gssqc1     $f30, $f28, 0x60(%[tmp])                 \n\t"
+    "punpckhbh  $f30, $f8, $f0                           \n\t"
+    "punpcklbh  $f28, $f8, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 0x10(%[tmp])                 \n\t"
+    "punpckhbh  $f30, $f12, $f0                          \n\t"
+    "punpcklbh  $f28, $f12, $f0                          \n\t"
+    "punpcklbh  $f12, $f14, $f2                          \n\t"
+    "punpckhbh  $f14, $f14, $f2                          \n\t"
+    "gssqc1     $f30, $f28, 0x50(%[tmp])                 \n\t"
+    "mov.d      $f28, $f16                               \n\t"
+    "punpcklbh  $f16, $f18, $f2                          \n\t"
+    "punpckhbh  $f18, $f18, $f2                          \n\t"
+    "punpcklbh  $f8, $f10, $f2                           \n\t"
+    "punpckhbh  $f10, $f10, $f2                          \n\t"
+    "punpckhbh  $f30, $f28, $f0                          \n\t"
+    "punpcklbh  $f28, $f28, $f0                          \n\t"
+    "gssqc1     $f14, $f12, 0x30(%[tmp])                 \n\t"
+    /* Cb mask -> ($f0,$f2): alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|. */
+    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
+    "gslqc1     $f2, $f0, 0x50(%[tmp])                   \n\t"
+    "psubh      $f4, $f12, $f0                           \n\t"
+    "psubh      $f6, $f14, $f2                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+    "gssqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
+    "pcmpgth    $f0, $f20, $f4                           \n\t"
+    "pcmpgth    $f2, $f22, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x60(%[tmp])                   \n\t"
+    "psubh      $f4, $f4, $f12                           \n\t"
+    "psubh      $f6, $f6, $f14                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+    "pcmpgth    $f16, $f24, $f4                          \n\t"
+    "pcmpgth    $f18, $f26, $f6                          \n\t"
+    "and        $f0, $f0, $f16                           \n\t"
+    "and        $f2, $f2, $f18                           \n\t"
+    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
+    "psubh      $f4, $f28, $f16                          \n\t"
+    "psubh      $f6, $f30, $f18                          \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+    "pcmpgth    $f16, $f24, $f4                          \n\t"
+    "pcmpgth    $f18, $f26, $f6                          \n\t"
+    /* Cr mask -> ($f20,$f22), same three conditions.  q1 Cb in ($f28,$f30)
+     * is spilled to the iAlpha/iBeta registers while $f28/$f30 serve as
+     * WELS_AbsH temporaries, and is restored afterwards. */
+    "gslqc1     $f6, $f4, 0x30(%[tmp])                   \n\t"
+    "psubh      $f4, $f8, $f4                            \n\t"
+    "psubh      $f6, $f10, $f6                           \n\t"
+    "dmfc1      %[iAlpha], $f28                          \n\t"
+    "dmfc1      %[iBeta], $f30                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+    "pcmpgth    $f20, $f20, $f4                          \n\t"
+    "pcmpgth    $f22, $f22, $f6                          \n\t"
+    "gslqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
+    "and        $f0, $f0, $f16                           \n\t"
+    "and        $f2, $f2, $f18                           \n\t"
+    "psubh      $f4, $f4, $f8                            \n\t"
+    "psubh      $f6, $f6, $f10                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
+    "pcmpgth    $f16, $f24, $f4                          \n\t"
+    "pcmpgth    $f18, $f26, $f6                          \n\t"
+    "gslqc1     $f6, $f4, 0x20(%[tmp])                   \n\t"
+    "gslqc1     $f30, $f28, 0x30(%[tmp])                 \n\t"
+    "psubh      $f4, $f4, $f28                           \n\t"
+    "psubh      $f6, $f6, $f30                           \n\t"
+    "and        $f20, $f20, $f16                         \n\t"
+    "and        $f22, $f22, $f18                         \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
+    "dmtc1      %[iAlpha], $f28                          \n\t"
+    "dmtc1      %[iBeta], $f30                           \n\t"
+    "pcmpgth    $f24, $f24, $f4                          \n\t"
+    "pcmpgth    $f26, $f26, $f6                          \n\t"
+    "and        $f20, $f20, $f24                         \n\t"
+    "and        $f22, $f22, $f26                         \n\t"
+    /* From here on the iBeta register holds the constant 2: it is broadcast
+     * as the rounding term (kept at tmp+0x10, replacing the p0 Cb copy) and
+     * also used as the shift count. */
+    "dli        %[iBeta], 0x2                            \n\t"
+    "dmtc1      %[iBeta], $f4                            \n\t"
+    "punpcklhw  $f16, $f4, $f4                           \n\t"
+    "punpcklwd  $f4, $f16, $f16                          \n\t"
+    "mov.d      $f6, $f4                                 \n\t"
+    /* p0' Cb = (2*p1 + p0 + q1 + 2) >> 2, selected by the Cb mask -> ($f4,$f6). */
+    "gslqc1     $f18, $f16, 0x60(%[tmp])                 \n\t"
+    "paddh      $f24, $f16, $f16                         \n\t"
+    "paddh      $f26, $f18, $f18                         \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    "paddh      $f24, $f24, $f28                         \n\t"
+    "paddh      $f26, $f26, $f30                         \n\t"
+    "gssqc1     $f6, $f4, 0x10(%[tmp])                   \n\t"
+    "gslqc1     $f18, $f16, 0x10(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "dmtc1      %[iBeta], $f16                           \n\t"
+    "psrah      $f24, $f24, $f16                         \n\t"
+    "psrah      $f26, $f26, $f16                         \n\t"
+    "pandn      $f16, $f0, $f12                          \n\t"
+    "pandn      $f18, $f2, $f14                          \n\t"
+    "gslqc1     $f14, $f12, 0x40(%[tmp])                 \n\t"
+    "and        $f4, $f0, $f24                           \n\t"
+    "and        $f6, $f2, $f26                           \n\t"
+    "or         $f4, $f4, $f16                           \n\t"
+    "or         $f6, $f6, $f18                           \n\t"
+    /* p0' Cr = (2*p1 + p0 + q1 + 2) >> 2, selected by the Cr mask -> ($f16,$f18). */
+    "paddh      $f24, $f12, $f12                         \n\t"
+    "paddh      $f26, $f14, $f14                         \n\t"
+    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f8                          \n\t"
+    "paddh      $f26, $f26, $f10                         \n\t"
+    "gslqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "dmtc1      %[iBeta], $f16                           \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    "psrah      $f24, $f24, $f16                         \n\t"
+    "psrah      $f26, $f26, $f16                         \n\t"
+    "and        $f16, $f20, $f24                         \n\t"
+    "and        $f18, $f22, $f26                         \n\t"
+    "pandn      $f24, $f20, $f8                          \n\t"
+    "pandn      $f26, $f22, $f10                         \n\t"
+    "or         $f16, $f16, $f24                         \n\t"
+    "or         $f18, $f18, $f26                         \n\t"
+    /* Pack back to bytes with unsigned saturation: $f4 = p0' Cb, $f6 = p0' Cr. */
+    "packushb   $f4, $f4, $f6                            \n\t"
+    "packushb   $f6, $f16, $f18                          \n\t"
+    /* q0' Cb = (2*q1 + q0 + p1 + 2) >> 2, selected by the Cb mask -> ($f8,$f10). */
+    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
+    "paddh      $f24, $f28, $f28                         \n\t"
+    "paddh      $f26, $f30, $f30                         \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "gslqc1     $f10, $f8, 0x60(%[tmp])                  \n\t"
+    "paddh      $f24, $f24, $f8                          \n\t"
+    "paddh      $f26, $f26, $f10                         \n\t"
+    "dmtc1      %[iBeta], $f28                           \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    "psrah      $f24, $f24, $f28                         \n\t"
+    "psrah      $f26, $f26, $f28                         \n\t"
+    "and        $f8, $f0, $f24                           \n\t"
+    "and        $f10, $f2, $f26                          \n\t"
+    "pandn      $f0, $f0, $f16                           \n\t"
+    "pandn      $f2, $f2, $f18                           \n\t"
+    "or         $f8, $f8, $f0                            \n\t"
+    "or         $f10, $f10, $f2                          \n\t"
+    /* q0' Cr = (2*q1 + q0 + p1 + 2) >> 2, selected by the Cr mask -> ($f16,$f18). */
+    "gslqc1     $f2, $f0, 0x20(%[tmp])                   \n\t"
+    "paddh      $f24, $f0, $f0                           \n\t"
+    "paddh      $f26, $f2, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 0x30(%[tmp])                   \n\t"
+    "paddh      $f24, $f24, $f0                          \n\t"
+    "paddh      $f26, $f26, $f2                          \n\t"
+    "gslqc1     $f18, $f16, 0x40(%[tmp])                 \n\t"
+    "paddh      $f24, $f24, $f16                         \n\t"
+    "paddh      $f26, $f26, $f18                         \n\t"
+    "paddh      $f24, $f24, $f12                         \n\t"
+    "paddh      $f26, $f26, $f14                         \n\t"
+    /* Store p0' Cb to pPixCb - iStride ($8). */
+    "gssdxc1    $f4, 0x0($8, $0)                         \n\t"
+    "psrah      $f24, $f24, $f28                         \n\t"
+    "psrah      $f26, $f26, $f28                         \n\t"
+    "and        $f16, $f20, $f24                         \n\t"
+    "and        $f18, $f22, $f26                         \n\t"
+    "pandn      $f20, $f20, $f0                          \n\t"
+    "pandn      $f22, $f22, $f2                          \n\t"
+    "or         $f16, $f16, $f20                         \n\t"
+    "or         $f18, $f18, $f22                         \n\t"
+    "packushb   $f8, $f8, $f10                           \n\t"
+    "packushb   $f10, $f16, $f18                         \n\t"
+    /* Store q0' Cb, p0' Cr (at pPixCr - iStride, $9) and q0' Cr. */
+    "gssdxc1    $f8, 0x0(%[pPixCb], $0)                  \n\t"
+    "gssdxc1    $f6, 0x0($9, $0)                         \n\t"
+    "gssdxc1    $f10, 0x0(%[pPixCr], $0)                 \n\t"
+    /* iAlpha and iBeta are overwritten inside the asm (dmfc1 spill, dli 0x2),
+     * so they must be read-write operands: writing to an input-only operand
+     * is not permitted by GCC, and the early-clobber keeps the two from
+     * sharing a register with each other or with another input. */
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
+      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
+    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta) {
+  unsigned char tmp[256] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
+    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
+    "move       $9, %[pPixCb]                             \n\t"
+    "move       $10, %[pPixCr]                            \n\t"
+    "dsll       $11, %[iStride], 0x2                      \n\t"
+    "daddu      %[pPixCb], %[pPixCb], $11                 \n\t"
+    "daddu      %[pPixCr], %[pPixCr], $11                 \n\t"
+    "daddiu     $11, %[tmp], 0x80                         \n\t"
+    "gsldlc1    $f0, 0x7($9)                              \n\t"
+    "gsldrc1    $f0, 0x0($9)                              \n\t"
+    "daddu      $12, $9, %[iStride]                       \n\t"
+    "gsldlc1    $f4, 0x7($12)                             \n\t"
+    "gsldrc1    $f4, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f8, 0x7($12)                             \n\t"
+    "gsldrc1    $f8, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f12, 0x7($12)                            \n\t"
+    "gsldlc1    $f16, 0x7($10)                            \n\t"
+    "gsldrc1    $f12, 0x0($12)                            \n\t"
+    "gsldrc1    $f16, 0x0($10)                            \n\t"
+    "daddu      $12, $10, %[iStride]                      \n\t"
+    "gsldlc1    $f20, 0x7($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f24, 0x7($12)                            \n\t"
+    "gsldrc1    $f24, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsldlc1    $f28, 0x7($12)                            \n\t"
+    "gsldrc1    $f28, 0x0($12)                            \n\t"
+    "punpcklwd  $f0, $f0, $f16                            \n\t"
+    "punpcklwd  $f4, $f4, $f20                            \n\t"
+    "punpcklwd  $f8, $f8, $f24                            \n\t"
+    "punpcklwd  $f12, $f12, $f28                          \n\t"
+    "gsldlc1    $f16, 0x7(%[pPixCb])                      \n\t"
+    "gsldlc1    $f20, 0x7(%[pPixCr])                      \n\t"
+    "gsldrc1    $f16, 0x0(%[pPixCb])                      \n\t"
+    "gsldrc1    $f20, 0x0(%[pPixCr])                      \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
+    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
+    "gsldlc1    $f16, 0x7($12)                            \n\t"
+    "gsldlc1    $f20, 0x7($13)                            \n\t"
+    "gsldrc1    $f16, 0x0($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($13)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f6, $f16                                 \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "daddu      $13, $13, %[iStride]                      \n\t"
+    "gsldlc1    $f16, 0x7($12)                            \n\t"
+    "gsldlc1    $f20, 0x7($13)                            \n\t"
+    "gsldrc1    $f16, 0x0($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($13)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f10, $f16                                \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "daddu      $13, $13, %[iStride]                      \n\t"
+    "gsldlc1    $f16, 0x7($12)                            \n\t"
+    "gsldlc1    $f20, 0x7($13)                            \n\t"
+    "gsldrc1    $f16, 0x0($12)                            \n\t"
+    "gsldrc1    $f20, 0x0($13)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f14, $f16                                \n\t"
+    "punpcklbh  $f24, $f2, $f6                            \n\t"
+    "punpckhbh  $f26, $f2, $f6                            \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+    "punpcklbh  $f28, $f10, $f14                          \n\t"
+    "punpckhbh  $f30, $f10, $f14                          \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
+    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
+    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
+    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
+    "gslqc1     $f18, $f16, 0x90(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0xa0(%[tmp])                  \n\t"
+    "gslqc1     $f30, $f28, 0xb0(%[tmp])                  \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "dmtc1      %[iAlpha], $f4                            \n\t"
+    "punpcklhw  $f8, $f4, $f4                             \n\t"
+    "punpcklwd  $f4, $f8, $f8                             \n\t"
+    "mov.d      $f6, $f4                                  \n\t"
+    "dmtc1      %[iBeta], $f8                             \n\t"
+    "punpcklhw  $f12, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f12, $f12                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+    "mov.d      $f12, $f24                                \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xa0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xb0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f0                           \n\t"
+    "punpckhbh  $f26, $f26, $f0                           \n\t"
+    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "punpckhbh  $f30, $f28, $f0                           \n\t"
+    "punpcklbh  $f28, $f28, $f0                           \n\t"
+    "punpckhbh  $f18, $f16, $f0                           \n\t"
+    "punpcklbh  $f16, $f16, $f0                           \n\t"
+    "punpckhbh  $f22, $f20, $f0                           \n\t"
+    "punpcklbh  $f20, $f20, $f0                           \n\t"
+    "punpckhbh  $f14, $f12, $f0                           \n\t"
+    "punpcklbh  $f12, $f12, $f0                           \n\t"
+    "gssqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f16, $f20                          \n\t"
+    "psubh      $f26, $f18, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f0, $f4, $f24                            \n\t"
+    "pcmpgth    $f2, $f6, $f26                            \n\t"
+    "psubh      $f24, $f12, $f16                          \n\t"
+    "psubh      $f26, $f14, $f18                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "pcmpgth    $f28, $f8, $f24                           \n\t"
+    "pcmpgth    $f30, $f10, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    "and        $f0, $f0, $f28                            \n\t"
+    "and        $f2, $f2, $f30                            \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
+    "dmfc1      %[iAlpha], $f20                           \n\t"
+    "dmfc1      %[iBeta], $f22                            \n\t"
+    "pcmpgth    $f28, $f8, $f24                           \n\t"
+    "pcmpgth    $f30, $f10, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+    "pcmpgth    $f4, $f4, $f24                            \n\t"
+    "pcmpgth    $f6, $f6, $f26                            \n\t"
+    "gslqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+    "and        $f0, $f0, $f28                            \n\t"
+    "and        $f2, $f2, $f30                            \n\t"
+    "pcmpgth    $f28, $f8, $f24                           \n\t"
+    "pcmpgth    $f30, $f10, $f26                          \n\t"
+    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
+    "dli        $8, 0x2                                   \n\t"
+    "and        $f4, $f4, $f28                            \n\t"
+    "and        $f6, $f6, $f30                            \n\t"
+    "pcmpgth    $f8, $f8, $f24                            \n\t"
+    "pcmpgth    $f10, $f10, $f26                          \n\t"
+    "and        $f4, $f4, $f8                             \n\t"
+    "and        $f6, $f6, $f10                            \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "punpcklhw  $f24, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f24, $f24                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+    "gssqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f8, $f12, $f12                           \n\t"
+    "paddh      $f10, $f14, $f14                          \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "gslqc1     $f22, $f20, 0x50(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f20                            \n\t"
+    "paddh      $f10, $f10, $f22                          \n\t"
+    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f24                            \n\t"
+    "paddh      $f10, $f10, $f26                          \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    "psrah      $f8, $f8, $f20                            \n\t"
+    "psrah      $f10, $f10, $f20                          \n\t"
+    "and        $f24, $f0, $f8                            \n\t"
+    "and        $f26, $f2, $f10                           \n\t"
+    "pandn      $f8, $f0, $f16                            \n\t"
+    "pandn      $f10, $f2, $f18                           \n\t"
+    "or         $f24, $f24, $f8                           \n\t"
+    "or         $f26, $f26, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
+    "paddh      $f28, $f8, $f8                            \n\t"
+    "paddh      $f30, $f10, $f10                          \n\t"
+    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
+    "paddh      $f28, $f28, $f20                          \n\t"
+    "paddh      $f30, $f30, $f22                          \n\t"
+    "gslqc1     $f18, $f16, 0x70(%[tmp])                  \n\t"
+    "paddh      $f28, $f28, $f16                          \n\t"
+    "paddh      $f30, $f30, $f18                          \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f28, $f28, $f8                           \n\t"
+    "paddh      $f30, $f30, $f10                          \n\t"
+    "pandn      $f8, $f4, $f20                            \n\t"
+    "pandn      $f10, $f6, $f22                           \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    "psrah      $f28, $f28, $f20                          \n\t"
+    "psrah      $f30, $f30, $f20                          \n\t"
+    "and        $f16, $f4, $f28                           \n\t"
+    "and        $f18, $f6, $f30                           \n\t"
+    "or         $f16, $f16, $f8                           \n\t"
+    "or         $f18, $f18, $f10                          \n\t"
+    "gslqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
+    "packushb   $f24, $f24, $f26                          \n\t"
+    "packushb   $f26, $f16, $f18                          \n\t"
+    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "paddh      $f24, $f8, $f8                            \n\t"
+    "paddh      $f26, $f10, $f10                          \n\t"
+    "dmtc1      %[iAlpha], $f20                           \n\t"
+    "dmtc1      %[iBeta], $f22                            \n\t"
+    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "paddh      $f24, $f24, $f12                          \n\t"
+    "paddh      $f26, $f26, $f14                          \n\t"
+    "mov.d      $f16, $f0                                 \n\t"
+    "mov.d      $f18, $f2                                 \n\t"
+    "pandn      $f0, $f0, $f20                            \n\t"
+    "pandn      $f2, $f2, $f22                            \n\t"
+    "dmtc1      $8, $f20                                  \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    "psrah      $f24, $f24, $f20                          \n\t"
+    "psrah      $f26, $f26, $f20                          \n\t"
+    "and        $f16, $f16, $f24                          \n\t"
+    "and        $f18, $f18, $f26                          \n\t"
+    "or         $f16, $f16, $f0                           \n\t"
+    "or         $f18, $f18, $f2                           \n\t"
+    "gslqc1     $f2, $f0, 0x70(%[tmp])                    \n\t"
+    "paddh      $f20, $f0, $f0                            \n\t"
+    "paddh      $f22, $f2, $f2                            \n\t"
+    "gslqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+    "gslqc1     $f14, $f12, 0x60(%[tmp])                  \n\t"
+    "paddh      $f20, $f20, $f12                          \n\t"
+    "paddh      $f22, $f22, $f14                          \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+    "dmtc1      $8, $f8                                   \n\t"
+    "psrah      $f20, $f20, $f8                           \n\t"
+    "psrah      $f22, $f22, $f8                           \n\t"
+    "and        $f12, $f4, $f20                           \n\t"
+    "and        $f14, $f6, $f22                           \n\t"
+    "pandn      $f4, $f4, $f0                             \n\t"
+    "pandn      $f6, $f6, $f2                             \n\t"
+    "or         $f12, $f12, $f4                           \n\t"
+    "or         $f14, $f14, $f6                           \n\t"
+    "packushb   $f16, $f16, $f18                          \n\t"
+    "packushb   $f18, $f12, $f14                          \n\t"
+    "gssqc1     $f18, $f16, 0xa0(%[tmp])                  \n\t"
+    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
+    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
+    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
+    "mov.d      $f26, $f2                                 \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "mov.d      $f30, $f10                                \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+    "punpcklbh  $f28, $f30, $f14                          \n\t"
+    "punpckhbh  $f30, $f30, $f14                          \n\t"
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "dli        %[iAlpha], 0x20                           \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "gsswlc1    $f0, 0x3($9)                              \n\t"
+    "gsswrc1    $f0, 0x0($9)                              \n\t"
+    "daddu      $12, $9, %[iStride]                       \n\t"
+    "gsswlc1    $f20, 0x3($12)                            \n\t"
+    "gsswrc1    $f20, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f4, 0x3($12)                             \n\t"
+    "gsswrc1    $f4, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f24, 0x3($12)                            \n\t"
+    "gsswrc1    $f24, 0x0($12)                            \n\t"
+    "dsrl       $f0, $f0, $f8                             \n\t"
+    "dsrl       $f20, $f20, $f8                           \n\t"
+    "dsrl       $f4, $f4, $f8                             \n\t"
+    "dsrl       $f24, $f24, $f8                           \n\t"
+    "gsswlc1    $f0, 0x3($10)                             \n\t"
+    "gsswrc1    $f0, 0x0($10)                             \n\t"
+    "daddu      $13, $10, %[iStride]                      \n\t"
+    "daddu      $8, $13, %[iStride]                       \n\t"
+    "gsswlc1    $f20, 0x3($13)                            \n\t"
+    "gsswrc1    $f20, 0x0($13)                            \n\t"
+    "daddu      $13, $8, %[iStride]                       \n\t"
+    "gsswlc1    $f4, 0x3($8)                              \n\t"
+    "gsswrc1    $f4, 0x0($8)                              \n\t"
+    "gsswlc1    $f24, 0x3($13)                            \n\t"
+    "gsswrc1    $f24, 0x0($13)                            \n\t"
+    "gsswlc1    $f2, 0x3(%[pPixCb])                       \n\t"
+    "gsswrc1    $f2, 0x0(%[pPixCb])                       \n\t"
+    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
+    "gsswlc1    $f22, 0x3($12)                            \n\t"
+    "gsswrc1    $f22, 0x0($12)                            \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f6, 0x3($12)                             \n\t"
+    "gsswrc1    $f6, 0x0($12)                             \n\t"
+    "daddu      $12, $12, %[iStride]                      \n\t"
+    "gsswlc1    $f26, 0x3($12)                            \n\t"
+    "gsswrc1    $f26, 0x0($12)                            \n\t"
+    "dsrl       $f2, $f2, $f8                             \n\t"
+    "dsrl       $f22, $f22, $f8                           \n\t"
+    "dsrl       $f6, $f6, $f8                             \n\t"
+    "dsrl       $f26, $f26, $f8                           \n\t"
+    "gsswlc1    $f2, 0x3(%[pPixCr])                       \n\t"
+    "gsswrc1    $f2, 0x0(%[pPixCr])                       \n\t"
+    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
+    "daddu      $8, $13, %[iStride]                       \n\t"
+    "gsswlc1    $f22, 0x3($13)                            \n\t"
+    "gsswrc1    $f22, 0x0($13)                            \n\t"
+    "daddu      $13, $8, %[iStride]                       \n\t"
+    "gsswlc1    $f6, 0x3($8)                              \n\t"
+    "gsswrc1    $f6, 0x0($8)                              \n\t"
+    "gsswlc1    $f26, 0x3($13)                            \n\t"
+    "gsswrc1    $f26, 0x0($13)                            \n\t"
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
+      "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
+                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
+  unsigned char tmp[320] __attribute__((aligned(32)));
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
+    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
+    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
+    "gsldlc1    $f0, 0x7(%[pPixCb])                       \n\t"
+    "gsldlc1    $f4, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pPixCb])                       \n\t"
+    "gsldrc1    $f4, 0x0($8)                              \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldlc1    $f12, 0x7($8)                             \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    "gsldrc1    $f12, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+
+    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
+    "gsldlc1    $f16, 0x7(%[pPixCr])                      \n\t"
+    "gsldlc1    $f20, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0(%[pPixCr])                      \n\t"
+    "gsldrc1    $f20, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsldlc1    $f24, 0x7($11)                            \n\t"
+    "gsldlc1    $f28, 0x7($10)                            \n\t"
+    "gsldrc1    $f24, 0x0($11)                            \n\t"
+    "gsldrc1    $f28, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+
+    "punpcklwd  $f0, $f0, $f16                            \n\t"
+    "punpcklwd  $f4, $f4, $f20                            \n\t"
+    "punpcklwd  $f8, $f8, $f24                            \n\t"
+    "punpcklwd  $f12, $f12, $f28                          \n\t"
+    "gsldlc1    $f16, 0x7($9)                             \n\t"
+    "gsldlc1    $f20, 0x7($11)                            \n\t"
+    "gsldrc1    $f16, 0x0($9)                             \n\t"
+    "gsldrc1    $f20, 0x0($11)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsldlc1    $f16, 0x7($8)                             \n\t"
+    "gsldlc1    $f20, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0($8)                             \n\t"
+    "gsldrc1    $f20, 0x0($10)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f6, $f16                                 \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+
+    "gsldlc1    $f16, 0x7($9)                             \n\t"
+    "gsldlc1    $f20, 0x7($11)                            \n\t"
+    "gsldrc1    $f16, 0x0($9)                             \n\t"
+    "gsldrc1    $f20, 0x0($11)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f10, $f16                                \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+
+    "gsldlc1    $f16, 0x7($8)                             \n\t"
+    "gsldlc1    $f20, 0x7($10)                            \n\t"
+    "gsldrc1    $f16, 0x0($8)                             \n\t"
+    "gsldrc1    $f20, 0x0($10)                            \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "mov.d      $f14, $f16                                \n\t"
+
+    "punpcklbh  $f24, $f2, $f6                            \n\t"
+    "punpckhbh  $f26, $f2, $f6                            \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+    "punpcklbh  $f28, $f10, $f14                          \n\t"
+    "punpckhbh  $f30, $f10, $f14                          \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+    "daddiu     $11, %[tmp], 0x70                         \n\t"
+
+    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
+    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
+    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
+
+    "lb         $8, 0x3(%[pTC])                           \n\t"
+    "lb         $9, 0x2(%[pTC])                           \n\t"
+    "lb         $10, 0x1(%[pTC])                          \n\t"
+    "lb         $11, 0x0(%[pTC])                          \n\t"
+
+    "and        $12, $8, 0xFFFF                           \n\t"
+    "dmtc1      $12, $f8                                  \n\t"
+
+    "and        $9, $9, 0xFFFF                            \n\t"
+    "dmtc1      $9, $f12                                  \n\t"
+    "mov.d      $f16, $f12                                \n\t"
+
+    "and        $9, $10, 0xFFFF                           \n\t"
+    "dmtc1      $9, $f20                                  \n\t"
+    "xor        $f0, $f0, $f0                             \n\t"
+    "mov.d      $f24, $f20                                \n\t"
+    "and        $9, $11, 0xFFFF                           \n\t"
+    "punpcklhw  $f24, $f24, $f8                           \n\t"
+
+    "mov.d      $f4, $f8                                  \n\t"
+    "dmtc1      $9, $f28                                  \n\t"
+    "mov.d      $f0, $f28                                 \n\t"
+
+    "punpcklhw  $f28, $f28, $f12                          \n\t"
+    "punpcklhw  $f20, $f20, $f4                           \n\t"
+    "xor        $f4, $f4, $f4                             \n\t"
+    "xor        $f6, $f6, $f6                             \n\t"
+    "punpcklhw  $f28, $f28, $f20                          \n\t"
+    "gslqc1     $f22, $f20, 0xA0(%[tmp])                  \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "punpcklhw  $f0, $f0, $f24                            \n\t"
+
+    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
+    "punpckhhw  $f2, $f0, $f28                            \n\t"
+    "punpcklhw  $f0, $f0, $f28                            \n\t"
+    "gslqc1     $f30, $f28, 0x80(%[tmp])                  \n\t"
+    "psubh      $f8, $f4, $f0                             \n\t"
+    "psubh      $f10, $f6, $f2                            \n\t"
+    "gssqc1     $f10, $f8, 0xD0(%[tmp])                   \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+    "punpcklhw  $f12, $f8, $f8                            \n\t"
+    "punpcklwd  $f16, $f12, $f12                          \n\t"
+    "mov.d      $f18, $f16                                \n\t"
+
+    "dmtc1      %[iBeta], $f8                             \n\t"
+    "punpcklhw  $f12, $f8, $f8                            \n\t"
+    "punpcklwd  $f8, $f12, $f12                           \n\t"
+    "mov.d      $f10, $f8                                 \n\t"
+
+    "gslqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
+    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
+    "punpckhbh  $f10, $f24, $f4                           \n\t"
+    "punpcklbh  $f8, $f24, $f4                            \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+
+    "gssqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
+    "gssqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
+    "punpcklbh  $f8, $f28, $f4                            \n\t"
+    "punpckhbh  $f10, $f28, $f4                           \n\t"
+    "punpcklbh  $f28, $f30, $f6                           \n\t"
+    "punpckhbh  $f30, $f30, $f6                           \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+    "punpckhbh  $f14, $f12, $f4                           \n\t"
+    "punpcklbh  $f12, $f12, $f4                           \n\t"
+    "punpckhbh  $f22, $f20, $f4                           \n\t"
+    "punpcklbh  $f20, $f20, $f4                           \n\t"
+    "gssqc1     $f30, $f28, 0xF0(%[tmp])                  \n\t"
+    "gssqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0xA0(%[tmp])                  \n\t"
+    "punpcklbh  $f24, $f26, $f6                           \n\t"
+    "punpckhbh  $f26, $f26, $f6                           \n\t"
+
+    "dli        $13, 0x4                                  \n\t"
+    "gssqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
+    "dmtc1      $13, $f24                                 \n\t"
+    "punpcklhw  $f28, $f24, $f24                          \n\t"
+    "punpcklwd  $f24, $f28, $f28                          \n\t"
+    "mov.d      $f26, $f24                                \n\t"
+    "dli        $12, 0x2                                  \n\t"
+    "dli        $13, 0x3                                  \n\t"
+
+    "gssqc1     $f2, $f0, 0x20(%[tmp])                    \n\t"
+    "dmfc1      %[iAlpha], $f0                            \n\t"
+    "dmfc1      %[iBeta], $f2                             \n\t"
+    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
+    "gslqc1     $f30, $f28, 0x40(%[tmp])                  \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+    "pcmpgth    $f24, $f0, $f4                            \n\t"
+    "pcmpgth    $f26, $f2, $f6                            \n\t"
+
+    "dmtc1      $12, $f0                                  \n\t"
+    "dmtc1      $13, $f2                                  \n\t"
+    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
+    "gslqc1     $f6, $f4, 0xD0(%[tmp])                    \n\t"
+    "psubh      $f24, $f12, $f8                           \n\t"
+    "psubh      $f26, $f14, $f10                          \n\t"
+    "psllh      $f24, $f24, $f0                           \n\t"
+    "psllh      $f26, $f26, $f0                           \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
+    "paddh      $f24, $f24, $f28                          \n\t"
+    "paddh      $f26, $f26, $f30                          \n\t"
+    "psrah      $f24, $f24, $f2                           \n\t"
+    "psrah      $f26, $f26, $f2                           \n\t"
+    "pmaxsh     $f4, $f4, $f24                            \n\t"
+    "pmaxsh     $f6, $f6, $f26                            \n\t"
+
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "pminsh     $f24, $f24, $f4                           \n\t"
+    "pminsh     $f26, $f26, $f6                           \n\t"
+
+    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
+    "psubh      $f4, $f8, $f12                            \n\t"
+    "psubh      $f6, $f10, $f14                           \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+    "pcmpgth    $f24, $f16, $f4                           \n\t"
+    "pcmpgth    $f26, $f18, $f6                           \n\t"
+    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
+    "psubh      $f4, $f4, $f8                             \n\t"
+    "psubh      $f6, $f6, $f10                            \n\t"
+    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
+    "pcmpgth    $f28, $f28, $f4                           \n\t"
+    "pcmpgth    $f30, $f30, $f6                           \n\t"
+
+    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
+    "and        $f24, $f24, $f28                          \n\t"
+    "and        $f26, $f26, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f20, $f20, $f12                          \n\t"
+    "psubh      $f22, $f22, $f14                          \n\t"
+    WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
+    "pcmpgth    $f4, $f4, $f20                            \n\t"
+    "pcmpgth    $f6, $f6, $f22                            \n\t"
+
+    "gslqc1     $f22, $f20, 0xB0(%[tmp])                  \n\t"
+    "gslqc1     $f2, $f0, 0xE0(%[tmp])                    \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "and        $f24, $f24, $f4                           \n\t"
+    "and        $f26, $f26, $f6                           \n\t"
+    "gslqc1     $f2, $f0, 0x60(%[tmp])                    \n\t"
+    "and        $f24, $f24, $f0                           \n\t"
+    "and        $f26, $f26, $f2                           \n\t"
+
+    "gslqc1     $f6, $f4, 0x20(%[tmp])                    \n\t"
+    "and        $f4, $f4, $f24                            \n\t"
+    "and        $f6, $f6, $f26                            \n\t"
+    "gslqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
+    "gssqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
+    "gslqc1     $f6, $f4, 0xF0(%[tmp])                    \n\t"
+
+    "dmtc1      $12, $f0                                  \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    "psllh      $f24, $f24, $f0                           \n\t"
+    "psllh      $f26, $f26, $f0                           \n\t"
+    "paddh      $f24, $f24, $f20                          \n\t"
+    "paddh      $f26, $f26, $f22                          \n\t"
+    "gslqc1     $f2, $f0, 0x30(%[tmp])                    \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    "dmtc1      %[iBeta], $f2                             \n\t"
+
+    "dmtc1      $13, $f0                                  \n\t"
+    "gslqc1     $f22, $f20, 0xD0(%[tmp])                  \n\t"
+    "psrah      $f24, $f24, $f0                           \n\t"
+    "psrah      $f26, $f26, $f0                           \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "pmaxsh     $f20, $f20, $f24                          \n\t"
+    "pmaxsh     $f22, $f22, $f26                          \n\t"
+    "pminsh     $f0, $f0, $f20                            \n\t"
+    "pminsh     $f2, $f2, $f22                            \n\t"
+
+    "dmfc1      %[iAlpha], $f0                            \n\t"
+    "dmfc1      %[iBeta], $f2                             \n\t"
+    "gslqc1     $f22, $f20, 0xC0(%[tmp])                  \n\t"
+    "psubh      $f24, $f4, $f20                           \n\t"
+    "psubh      $f26, $f6, $f22                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f16, $f16, $f24                          \n\t"
+    "pcmpgth    $f18, $f18, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f4                           \n\t"
+    "psubh      $f26, $f26, $f6                           \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+
+    "gslqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+
+    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
+    "psubh      $f24, $f24, $f20                          \n\t"
+    "psubh      $f26, $f26, $f22                          \n\t"
+    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
+    "pcmpgth    $f28, $f28, $f24                          \n\t"
+    "pcmpgth    $f30, $f30, $f26                          \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "gslqc1     $f30, $f28, 0x60(%[tmp])                  \n\t"
+    "dmtc1      %[iAlpha], $f0                            \n\t"
+    "dmtc1      %[iBeta], $f2                             \n\t"
+    "and        $f16, $f16, $f28                          \n\t"
+    "and        $f18, $f18, $f30                          \n\t"
+    "and        $f0, $f0, $f16                            \n\t"
+    "and        $f2, $f2, $f18                            \n\t"
+
+    "gslqc1     $f18, $f16, 0x40(%[tmp])                  \n\t"
+    "paddh      $f8, $f8, $f16                            \n\t"
+    "paddh      $f10, $f10, $f18                          \n\t"
+    "paddh      $f4, $f4, $f0                             \n\t"
+    "paddh      $f6, $f6, $f2                             \n\t"
+    "psubh      $f12, $f12, $f16                          \n\t"
+    "psubh      $f14, $f14, $f18                          \n\t"
+    "psubh      $f20, $f20, $f0                           \n\t"
+    "psubh      $f22, $f22, $f2                           \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t"
+    "packushb   $f10, $f4, $f6                            \n\t"
+    "packushb   $f12, $f12, $f14                          \n\t"
+    "packushb   $f14, $f20, $f22                          \n\t"
+
+    "gssqc1     $f10, $f8, 0x80(%[tmp])                   \n\t"
+    "gssqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
+    "daddiu     $11, %[tmp], 0x70                         \n\t"
+
+    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
+    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
+    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
+    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
+
+    "punpcklbh  $f24, $f2, $f6                            \n\t"
+    "punpckhbh  $f26, $f2, $f6                            \n\t"
+    "punpckhbh  $f2, $f0, $f4                             \n\t"
+    "punpcklbh  $f0, $f0, $f4                             \n\t"
+
+    "punpcklbh  $f28, $f10, $f14                          \n\t"
+    "punpckhbh  $f30, $f10, $f14                          \n\t"
+    "punpckhbh  $f10, $f8, $f12                           \n\t"
+    "punpcklbh  $f8, $f8, $f12                            \n\t"
+
+    "punpcklhw  $f16, $f2, $f10                           \n\t"
+    "punpckhhw  $f18, $f2, $f10                           \n\t"
+    "punpckhhw  $f2, $f0, $f8                             \n\t"
+    "punpcklhw  $f0, $f0, $f8                             \n\t"
+    "punpcklhw  $f20, $f26, $f30                          \n\t"
+    "punpckhhw  $f22, $f26, $f30                          \n\t"
+    "punpckhhw  $f26, $f24, $f28                          \n\t"
+    "punpcklhw  $f24, $f24, $f28                          \n\t"
+
+    "punpcklwd  $f4, $f2, $f26                            \n\t"
+    "punpckhwd  $f6, $f2, $f26                            \n\t"
+    "punpckhwd  $f2, $f0, $f24                            \n\t"
+    "punpcklwd  $f0, $f0, $f24                            \n\t"
+    "punpcklwd  $f8, $f18, $f22                           \n\t"
+    "punpckhwd  $f10, $f18, $f22                          \n\t"
+    "punpckhwd  $f18, $f16, $f20                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+
+    "mov.d      $f20, $f2                                 \n\t"
+    "mov.d      $f22, $f18                                \n\t"
+    "mov.d      $f2, $f16                                 \n\t"
+    "mov.d      $f24, $f6                                 \n\t"
+    "mov.d      $f26, $f10                                \n\t"
+    "mov.d      $f6, $f8                                  \n\t"
+
+    "dli        %[iAlpha], 0x20                           \n\t"
+    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
+    "gsswlc1    $f0, 0x3(%[pPixCb])                       \n\t"
+    "gsswlc1    $f20, 0x3($8)                             \n\t"
+    "gsswrc1    $f0, 0x0(%[pPixCb])                       \n\t"
+    "gsswrc1    $f20, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsswlc1    $f4, 0x3($9)                              \n\t"
+    "gsswlc1    $f24, 0x3($8)                             \n\t"
+    "gsswrc1    $f4, 0x0($9)                              \n\t"
+    "gsswrc1    $f24, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "dmtc1      %[iAlpha], $f8                            \n\t"
+
+    "dsrl       $f0, $f0, $f8                             \n\t"
+    "dsrl       $f20, $f20, $f8                           \n\t"
+    "dsrl       $f4, $f4, $f8                             \n\t"
+    "dsrl       $f24, $f24, $f8                           \n\t"
+    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
+    "gsswlc1    $f0, 0x3(%[pPixCr])                       \n\t"
+    "gsswlc1    $f20, 0x3($10)                            \n\t"
+    "gsswrc1    $f0, 0x0(%[pPixCr])                       \n\t"
+    "gsswrc1    $f20, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsswlc1    $f4, 0x3($11)                             \n\t"
+    "gsswlc1    $f24, 0x3($10)                            \n\t"
+    "gsswrc1    $f4, 0x0($11)                             \n\t"
+    "gsswrc1    $f24, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsswlc1    $f2, 0x3($9)                              \n\t"
+    "gsswlc1    $f22, 0x3($8)                             \n\t"
+    "gsswrc1    $f2, 0x0($9)                              \n\t"
+    "gsswrc1    $f22, 0x0($8)                             \n\t"
+    "daddu      $9, $8, %[iStride]                        \n\t"
+    "daddu      $8, $9, %[iStride]                        \n\t"
+    "gsswlc1    $f6, 0x3($9)                              \n\t"
+    "gsswlc1    $f26, 0x3($8)                             \n\t"
+    "gsswrc1    $f6, 0x0($9)                              \n\t"
+    "gsswrc1    $f26, 0x0($8)                             \n\t"
+
+    "dsrl       $f2, $f2, $f8                             \n\t"
+    "dsrl       $f22, $f22, $f8                           \n\t"
+    "dsrl       $f6, $f6, $f8                             \n\t"
+    "dsrl       $f26, $f26, $f8                           \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsswlc1    $f2, 0x3($11)                             \n\t"
+    "gsswlc1    $f22, 0x3($10)                            \n\t"
+    "gsswrc1    $f2, 0x0($11)                             \n\t"
+    "gsswrc1    $f22, 0x0($10)                            \n\t"
+    "daddu      $11, $10, %[iStride]                      \n\t"
+    "daddu      $10, $11, %[iStride]                      \n\t"
+    "gsswlc1    $f6, 0x3($11)                             \n\t"
+    "gsswlc1    $f26, 0x3($10)                            \n\t"
+    "gsswrc1    $f6, 0x0($11)                             \n\t"
+    "gsswrc1    $f26, 0x0($10)                            \n\t"
+    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
+    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
+      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
+      "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+      "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) { /* In place: replace each of the 24 bytes at pNonZeroCount with min(byte, 1), bytes compared as unsigned. */
+  __asm__ volatile( /* volatile: the only effect is the stores through the pointer, so the statement must not be dropped */
+    ".set       arch=loongson3a                 \n\t" /* let the assembler accept the Loongson gs* and MMI p* opcodes below */
+    "gsldlc1    $f0, 0x7(%[pNonZeroCount])      \n\t" /* left/right load pairs: unaligned 64-bit loads; $f0 <- bytes 0..7 */
+    "gsldlc1    $f2, 0xF(%[pNonZeroCount])      \n\t" /* $f2 <- bytes 8..15 (completed by the gsldrc1 at offset 0x8) */
+    "gsldlc1    $f4, 0x17(%[pNonZeroCount])     \n\t" /* $f4 <- bytes 16..23 (completed by the gsldrc1 at offset 0x10) */
+    "gsldrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
+    "gsldrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
+    "gsldrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
+    "pcmpeqh    $f8, $f8, $f8                   \n\t" /* a register always equals itself: every halfword of $f8 becomes 0xFFFF */
+    "dli        $8, 0xF                         \n\t" /* shift count 15 ... */
+    "dmtc1      $8, $f6                         \n\t" /* ... moved into $f6, since psrlh takes its count from an FP register */
+    "psrlh      $f8, $f8, $f6                   \n\t" /* 0xFFFF >> 15: every halfword of $f8 is now 0x0001 */
+    "packushb   $f8, $f8, $f8                   \n\t" /* pack halfwords to bytes: every byte of $f8 is now 0x01 */
+    /* Clamp all 24 loaded bytes against the 0x01 mask, then write them back to the same addresses. */
+    "pminub     $f0, $f0, $f8                   \n\t" /* unsigned byte minimum: 0 stays 0, any non-zero byte (including 0x80..0xFF) becomes 1 */
+    "pminub     $f2, $f2, $f8                   \n\t"
+    "pminub     $f4, $f4, $f8                   \n\t"
+    "gssdlc1    $f0, 0x7(%[pNonZeroCount])      \n\t" /* left/right store pairs: unaligned 64-bit stores mirroring the loads above */
+    "gssdlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
+    "gssdlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
+    "gssdrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
+    "gssdrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
+    "gssdrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
+    : /* no output operands: results are written through the pointer, covered by the "memory" clobber */
+    : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount) /* base address of the 24-byte array, used for both loads and stores */
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8" /* $8 and $f6 hold the shift count, $f8 the 0x01 mask, $f0/$f2/$f4 the data */
+  );
+}
--- a/codec/common/mips64/deblock_mmi.c
+++ /dev/null
@@ -1,2826 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2018, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file    deblock_mmi.c
- *
- * \brief   Loongson optimize
- *
- * \date    20/07/2018 Created
- *
- *************************************************************************************
- */
-#include <stdint.h>
-#include "asmdefs_mmi.h"
-
-void DeblockLumaLt4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
-                         int32_t iBeta, int8_t *pTC) {
-  unsigned char tmp[512] __attribute__((aligned(32)));
-  BACKUP_REG;
-  __asm__ volatile (
-    ".set       arch=loongson3a                           \n\t"
-    "dsll       $8, %[iStride], 0x1                       \n\t"
-    "daddu      $8, $8, %[iStride]                        \n\t"
-    "dsubu      $14, %[pPix], $8                          \n\t"
-
-    "dsll       $8, %[iStride], 0x1                       \n\t"
-    "dsubu      $9, %[pPix], $8                           \n\t"
-
-    "dmtc1      %[iAlpha], $f0                            \n\t"
-    "dsubu      $13, %[pPix], %[iStride]                  \n\t"
-    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
-    "daddu      $12, $8, %[pPix]                          \n\t"
-
-    "punpcklhw  $f0, $f0, $f0                             \n\t"
-    "lb         $8, 0x0(%[pTC])                           \n\t"
-    "punpcklwd  $f0, $f0, $f0                             \n\t"
-    "mov.d      $f2, $f0                                  \n\t"
-    "gssqc1     $f2, $f0, 432-112(%[tmp])                 \n\t"
-    "dmtc1      %[iBeta], $f0                             \n\t"
-    "lb         %[iAlpha], 0x1(%[pTC])                    \n\t"
-    "dli        %[iBeta], 0xFFFF                          \n\t"
-    "punpcklhw  $f0, $f0, $f0                             \n\t"
-    "and        $10, %[iAlpha], %[iBeta]                  \n\t"
-    "punpcklwd  $f0, $f0, $f0                             \n\t"
-    "mov.d      $f2, $f0                                  \n\t"
-    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
-    "dmtc1      $10, $f4                                  \n\t"
-    "mov.d      $f8, $f4                                  \n\t"
-    "dmtc1      %[iAlpha], $f16                           \n\t"
-    "and        %[iAlpha], $8, %[iBeta]                   \n\t"
-    "dmtc1      %[iAlpha], $f20                           \n\t"
-    "mov.d      $f24, $f20                                \n\t"
-    "mov.d      $f28, $f20                                \n\t"
-    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
-    "dmtc1      %[iAlpha], $f0                            \n\t"
-
-    "lb         %[iAlpha], 0x3(%[pTC])                    \n\t"
-    "lb         %[pTC], 0x2(%[pTC])                       \n\t"
-    "dmtc1      $10, $f12                                 \n\t"
-    "punpcklhw  $f0, $f0, $f16                            \n\t"
-    "and        $8, %[iAlpha], %[iBeta]                   \n\t"
-    "punpcklhw  $f24, $f24, $f8                           \n\t"
-    "punpcklhw  $f20, $f20, $f4                           \n\t"
-    "punpcklhw  $f0, $f0, $f24                            \n\t"
-    "punpcklhw  $f28, $f28, $f12                          \n\t"
-    "punpcklhw  $f28, $f28, $f20                          \n\t"
-    "punpckhhw  $f2, $f0, $f28                            \n\t"
-    "punpcklhw  $f0, $f0, $f28                            \n\t"
-    "gssqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
-    "dmtc1      $8, $f0                                   \n\t"
-    "and        %[iAlpha], %[iAlpha], %[iBeta]            \n\t"
-    "mov.d      $f8, $f0                                  \n\t"
-    "dmtc1      %[iAlpha], $f16                           \n\t"
-    "and        %[iAlpha], %[pTC], %[iBeta]               \n\t"
-    "dmtc1      $8, $f12                                  \n\t"
-    "dmtc1      %[iAlpha], $f20                           \n\t"
-    "punpcklhw  $f20, $f20, $f0                           \n\t"
-
-    "xor        $f0, $f0, $f0                             \n\t"
-    "dmtc1      %[iAlpha], $f24                           \n\t"
-    "and        %[pTC], %[pTC], %[iBeta]                  \n\t"
-    "punpcklhw  $f24, $f24, $f8                           \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "dmtc1      %[pTC], $f4                               \n\t"
-
-    "gslqc1     $f10, $f8, 0x0($9)                        \n\t"
-    "punpckhbh  $f10, $f8, $f0                            \n\t"
-    "punpcklbh  $f8, $f8, $f0                             \n\t"
-
-    "dli        %[iAlpha], 0x4                            \n\t"
-    "seh        %[pTC], %[iAlpha]                         \n\t"
-    "punpcklhw  $f28, $f28, $f12                          \n\t"
-    "punpcklhw  $f28, $f28, $f20                          \n\t"
-    "gslqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
-    "gslqc1     $f14, $f12, 0x0($13)                      \n\t"
-    "gsldxc1    $f2, 0x0($12, $0)                         \n\t"
-    "punpckhbh  $f22, $f20, $f0                           \n\t"
-    "punpcklbh  $f20, $f20, $f0                           \n\t"
-    "gssqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
-    "punpckhbh  $f22, $f2, $f0                            \n\t"
-    "punpcklbh  $f20, $f2, $f0                            \n\t"
-    "gssqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
-    "punpcklhw  $f4, $f4, $f16                            \n\t"
-    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
-    "punpcklhw  $f4, $f4, $f24                            \n\t"
-    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
-    "punpckhhw  $f6, $f4, $f28                            \n\t"
-    "punpcklhw  $f4, $f4, $f28                            \n\t"
-    "punpckhbh  $f26, $f24, $f0                           \n\t"
-    "punpcklbh  $f24, $f24, $f0                           \n\t"
-    "punpckhbh  $f14, $f12, $f0                           \n\t"
-    "punpcklbh  $f12, $f12, $f0                           \n\t"
-    "punpckhbh  $f18, $f16, $f0                           \n\t"
-    "punpcklbh  $f16, $f16, $f0                           \n\t"
-    "psubh      $f28, $f12, $f16                          \n\t"
-    "psubh      $f30, $f14, $f18                          \n\t"
-    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f16, $f18)
-    "gslqc1     $f18, $f16, 432-336(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
-    "pcmpgth    $f20, $f16, $f28                          \n\t"
-    "pcmpgth    $f22, $f18, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
-    "psubh      $f28, $f24, $f0                           \n\t"
-    "psubh      $f30, $f26, $f2                           \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
-    "pcmpgth    $f20, $f16, $f28                          \n\t"
-    "pcmpgth    $f22, $f18, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
-    "pavgh      $f20, $f12, $f24                          \n\t"
-    "pavgh      $f22, $f14, $f26                          \n\t"
-    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
-    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 432-256(%[tmp])                 \n\t"
-    "psubh      $f20, $f20, $f28                          \n\t"
-    "psubh      $f22, $f22, $f30                          \n\t"
-    "psubh      $f20, $f20, $f0                           \n\t"
-    "psubh      $f22, $f22, $f2                           \n\t"
-    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
-    "psubh      $f20, $f24, $f12                          \n\t"
-    "psubh      $f22, $f26, $f14                          \n\t"
-    "gssqc1     $f26, $f24, 432-32(%[tmp])                \n\t"
-    "psubh      $f24, $f24, $f0                           \n\t"
-    "psubh      $f26, $f26, $f2                           \n\t"
-    "gssqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
-    WELS_AbsH($f28, $f30, $f20, $f22, $f28, $f30)
-    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
-    "pcmpgth    $f20, $f20, $f28                          \n\t"
-    "pcmpgth    $f22, $f22, $f30                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
-    "pcmpgth    $f28, $f16, $f24                          \n\t"
-    "pcmpgth    $f30, $f18, $f26                          \n\t"
-
-    "xor        $f0, $f0, $f0                             \n\t"
-    "and        $f20, $f20, $f28                          \n\t"
-    "and        $f22, $f22, $f30                          \n\t"
-    "psubh      $f24, $f12, $f8                           \n\t"
-    "psubh      $f26, $f14, $f10                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
-    "pcmpgth    $f28, $f16, $f24                          \n\t"
-    "pcmpgth    $f30, $f18, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
-    "and        $f20, $f20, $f28                          \n\t"
-    "and        $f22, $f22, $f30                          \n\t"
-    "pcmpgth    $f28, $f24, $f0                           \n\t"
-    "pcmpgth    $f30, $f26, $f0                           \n\t"
-    "pcmpeqh    $f24, $f24, $f0                           \n\t"
-    "pcmpeqh    $f26, $f26, $f0                           \n\t"
-    "or         $f28, $f28, $f24                          \n\t"
-    "or         $f30, $f30, $f26                          \n\t"
-    "and        $f20, $f20, $f28                          \n\t"
-    "and        $f22, $f22, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
-    "dmtc1      %[pTC], $f20                              \n\t"
-    "punpckhhw  $f26, $f20, $f20                          \n\t"
-    "punpcklhw  $f24, $f20, $f20                          \n\t"
-    "punpcklwd  $f20, $f24, $f24                          \n\t"
-    "mov.d      $f22, $f20                                \n\t"
-    "gssqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
-    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
-    "psubh      $f24, $f0, $f20                           \n\t"
-    "dli        $11, 0x2                                  \n\t"
-    "psubh      $f26, $f0, $f22                           \n\t"
-    "dmtc1      $11, $f28                                 \n\t"
-    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 432-240(%[tmp])                 \n\t"
-    "psllh      $f20, $f20, $f28                          \n\t"
-    "psllh      $f22, $f22, $f28                          \n\t"
-    "psubh      $f28, $f8, $f0                            \n\t"
-    "psubh      $f30, $f10, $f2                           \n\t"
-    "paddh      $f28, $f28, $f20                          \n\t"
-    "paddh      $f30, $f30, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
-    "paddh      $f28, $f28, $f20                          \n\t"
-    "paddh      $f30, $f30, $f22                          \n\t"
-    "dli        $11, 0x3                                  \n\t"
-    "dmtc1      $11, $f20                                 \n\t"
-    "psrah      $f28, $f28, $f20                          \n\t"
-    "psrah      $f30, $f30, $f20                          \n\t"
-    "gslqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
-    "pmaxsh     $f24, $f24, $f28                          \n\t"
-    "pmaxsh     $f26, $f26, $f30                          \n\t"
-    "gslqc1     $f2, $f0, 432-320(%[tmp])                 \n\t"
-    "pminsh     $f20, $f20, $f24                          \n\t"
-    "pminsh     $f22, $f22, $f26                          \n\t"
-
-    "and        $f20, $f20, $f0                           \n\t"
-    "and        $f22, $f22, $f2                           \n\t"
-    "gslqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
-    "gssqc1     $f22, $f20, 432-64(%[tmp])                \n\t"
-    "xor        $f0, $f0, $f0                             \n\t"
-    "gssqc1     $f26, $f24, 432-384(%[tmp])               \n\t"
-    "psubh      $f20, $f0, $f24                           \n\t"
-    "psubh      $f22, $f0, $f26                           \n\t"
-    "gssqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
-    "mov.d      $f24, $f20                                \n\t"
-    "mov.d      $f26, $f22                                \n\t"
-    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
-    "paddh      $f20, $f20, $f28                          \n\t"
-    "paddh      $f22, $f22, $f30                          \n\t"
-    "paddh      $f28, $f8, $f8                            \n\t"
-    "paddh      $f30, $f10, $f10                          \n\t"
-    "psubh      $f20, $f20, $f28                          \n\t"
-    "psubh      $f22, $f22, $f30                          \n\t"
-    "dli        $11, 0x1                                  \n\t"
-    "dmtc1      $11, $f28                                 \n\t"
-    "psrah      $f20, $f20, $f28                          \n\t"
-    "psrah      $f22, $f22, $f28                          \n\t"
-    "pmaxsh     $f24, $f24, $f20                          \n\t"
-    "pmaxsh     $f26, $f26, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 432-384(%[tmp])               \n\t"
-    "pminsh     $f20, $f20, $f24                          \n\t"
-    "pminsh     $f22, $f22, $f26                          \n\t"
-
-    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 432-288(%[tmp])               \n\t"
-    "and        $f20, $f20, $f24                          \n\t"
-    "and        $f22, $f22, $f26                          \n\t"
-    "and        $f20, $f20, $f28                          \n\t"
-    "and        $f22, $f22, $f30                          \n\t"
-    "gslqc1     $f26, $f24, 432-240(%[tmp])               \n\t"
-    "gssqc1     $f22, $f20, 432-96(%[tmp])                \n\t"
-    "gslqc1     $f22, $f20, 432-352(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
-    "paddh      $f20, $f20, $f28                          \n\t"
-    "paddh      $f22, $f22, $f30                          \n\t"
-    "paddh      $f28, $f24, $f24                          \n\t"
-    "paddh      $f30, $f26, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
-    "dli        $11, 0x1                                  \n\t"
-    "psubh      $f20, $f20, $f28                          \n\t"
-    "dmtc1      $11, $f28                                 \n\t"
-    "psubh      $f22, $f22, $f30                          \n\t"
-
-    "psrah      $f20, $f20, $f28                          \n\t"
-    "psrah      $f22, $f22, $f28                          \n\t"
-    "gslqc1     $f30, $f28, 0x0(%[iStride])               \n\t"
-    "pmaxsh     $f24, $f24, $f20                          \n\t"
-    "pmaxsh     $f26, $f26, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
-    "pminsh     $f20, $f20, $f24                          \n\t"
-    "pminsh     $f22, $f22, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
-    "and        $f20, $f20, $f24                          \n\t"
-    "and        $f22, $f22, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 432-256(%[tmp])               \n\t"
-    "and        $f20, $f20, $f24                          \n\t"
-    "and        $f22, $f22, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 0x0($9)                       \n\t"
-    "punpcklbh  $f28, $f30, $f0                           \n\t"
-    "punpckhbh  $f30, $f30, $f0                           \n\t"
-    "gssqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
-
-    "gslqc1     $f30, $f28, 0x0($12)                      \n\t"
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "gssqc1     $f22, $f20, 432-48(%[tmp])                \n\t"
-    "gslqc1     $f22, $f20, 0x0($14)                      \n\t"
-    "gssqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 0x0($13)                      \n\t"
-    "punpcklbh  $f28, $f30, $f0                           \n\t"
-    "punpckhbh  $f30, $f30, $f0                           \n\t"
-    "punpcklbh  $f20, $f22, $f0                           \n\t"
-    "punpckhbh  $f22, $f22, $f0                           \n\t"
-    "gssqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "gssqc1     $f26, $f24, 432-400(%[tmp])               \n\t"
-
-    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 0x0(%[pPix])                  \n\t"
-    "psubh      $f28, $f28, $f20                          \n\t"
-    "psubh      $f30, $f30, $f22                          \n\t"
-    "gssqc1     $f22, $f20, 432-16(%[tmp])                \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "pcmpgth    $f20, $f16, $f28                          \n\t"
-    "pcmpgth    $f22, $f18, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 432-384(%[tmp])               \n\t"
-    "gssqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
-
-    "psubh      $f28, $f24, $f28                          \n\t"
-    "psubh      $f30, $f26, $f30                          \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f20, $f22)
-    "pcmpgth    $f20, $f16, $f28                          \n\t"
-    "pcmpgth    $f22, $f18, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 432-256(%[tmp])               \n\t"
-
-    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
-    "gssqc1     $f26, $f24, 432-80(%[tmp])                \n\t"
-    "pavgh      $f20, $f20, $f24                          \n\t"
-    "pavgh      $f22, $f22, $f26                          \n\t"
-    "gssqc1     $f22, $f20, 432-304(%[tmp])               \n\t"
-
-    "gslqc1     $f22, $f20, 432-288(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 432-256(%[tmp])               \n\t"
-    "psubh      $f20, $f4, $f20                           \n\t"
-    "psubh      $f22, $f6, $f22                           \n\t"
-    "psubh      $f20, $f20, $f28                          \n\t"
-    "psubh      $f22, $f22, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 432-224(%[tmp])               \n\t"
-    "gslqc1     $f22, $f20, 432-400(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 432-352(%[tmp])               \n\t"
-    "psubh      $f20, $f24, $f20                          \n\t"
-    "psubh      $f22, $f26, $f22                          \n\t"
-    "psubh      $f24, $f24, $f28                          \n\t"
-    "psubh      $f26, $f26, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
-    "mov.d      $f28, $f20                                \n\t"
-    "mov.d      $f30, $f22                                \n\t"
-    WELS_AbsH($f28, $f30, $f20, $f22, $f0, $f2)
-    "gslqc1     $f22, $f20, 432-112(%[tmp])               \n\t"
-    "pcmpgth    $f20, $f20, $f28                          \n\t"
-    "pcmpgth    $f22, $f22, $f30                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
-    "pcmpgth    $f28, $f16, $f24                          \n\t"
-    "pcmpgth    $f30, $f18, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 432-368(%[tmp])               \n\t"
-
-    "and        $f20, $f20, $f28                          \n\t"
-    "and        $f22, $f22, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 432-400(%[tmp])               \n\t"
-    "psubh      $f28, $f28, $f24                          \n\t"
-    "psubh      $f30, $f30, $f26                          \n\t"
-    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
-    "psubh      $f24, $f24, $f0                           \n\t"
-    "psubh      $f26, $f26, $f2                           \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f0, $f2)
-    "pcmpgth    $f16, $f16, $f28                          \n\t"
-    "pcmpgth    $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 432-96(%[tmp])                \n\t"
-    "and        $f20, $f20, $f16                          \n\t"
-    "and        $f22, $f22, $f18                          \n\t"
-    "xor        $f0, $f0, $f0                             \n\t"
-
-    "paddh      $f8, $f8, $f28                            \n\t"
-    "paddh      $f10, $f10, $f30                          \n\t"
-    "pcmpgth    $f16, $f4, $f0                            \n\t"
-    "pcmpgth    $f18, $f6, $f0                            \n\t"
-    "pcmpeqh    $f28, $f4, $f0                            \n\t"
-    "pcmpeqh    $f30, $f6, $f0                            \n\t"
-    "or         $f16, $f16, $f28                          \n\t"
-    "or         $f18, $f18, $f30                          \n\t"
-    "and        $f20, $f20, $f16                          \n\t"
-    "and        $f22, $f22, $f18                          \n\t"
-    "gslqc1     $f18, $f16, 432-224(%[tmp])               \n\t"
-    "gssqc1     $f22, $f20, 432-320(%[tmp])               \n\t"
-    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
-    "dli        $11, 0x2                                  \n\t"
-    "psubh      $f28, $f0, $f16                           \n\t"
-    "psubh      $f30, $f0, $f18                           \n\t"
-    "psubh      $f2, $f0, $f6                             \n\t"
-    "psubh      $f0, $f0, $f4                             \n\t"
-    "dmfc1      %[iAlpha], $f28                           \n\t"
-    "dmtc1      $11, $f28                                 \n\t"
-    "psllh      $f20, $f20, $f28                          \n\t"
-    "psllh      $f22, $f22, $f28                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "paddh      $f24, $f24, $f20                          \n\t"
-    "paddh      $f26, $f26, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 432-336(%[tmp])               \n\t"
-    "paddh      $f24, $f24, $f20                          \n\t"
-    "paddh      $f26, $f26, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 432-368(%[tmp])               \n\t"
-    "dli        $11, 0x3                                  \n\t"
-    "gssqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
-    "dmfc1      %[iAlpha], $f0                            \n\t"
-    "dmtc1      $11, $f0                                  \n\t"
-    "psrah      $f24, $f24, $f0                           \n\t"
-    "psrah      $f26, $f26, $f0                           \n\t"
-    "dmtc1      %[iAlpha], $f0                            \n\t"
-    "pmaxsh     $f28, $f28, $f24                          \n\t"
-    "pmaxsh     $f30, $f30, $f26                          \n\t"
-    "pminsh     $f16, $f16, $f28                          \n\t"
-    "pminsh     $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 432-320(%[tmp])               \n\t"
-    "and        $f16, $f16, $f28                          \n\t"
-    "and        $f18, $f18, $f30                          \n\t"
-    "mov.d      $f24, $f0                                 \n\t"
-    "mov.d      $f26, $f2                                 \n\t"
-    "gslqc1     $f2, $f0, 432-16(%[tmp])                  \n\t"
-    "gslqc1     $f30, $f28, 432-304(%[tmp])               \n\t"
-    "paddh      $f0, $f0, $f28                            \n\t"
-    "paddh      $f2, $f2, $f30                            \n\t"
-    "gssqc1     $f18, $f16, 432-272(%[tmp])               \n\t"
-    "gslqc1     $f18, $f16, 432-368(%[tmp])               \n\t"
-    "dli        $11, 0x1                                  \n\t"
-    "paddh      $f16, $f16, $f16                          \n\t"
-    "paddh      $f18, $f18, $f18                          \n\t"
-    "psubh      $f0, $f0, $f16                            \n\t"
-    "psubh      $f2, $f2, $f18                            \n\t"
-
-    "dmtc1      $11, $f28                                 \n\t"
-    "gslqc1     $f18, $f16, 432-64(%[tmp])                \n\t"
-    "psrah      $f0, $f0, $f28                            \n\t"
-    "psrah      $f2, $f2, $f28                            \n\t"
-    "pmaxsh     $f24, $f24, $f0                           \n\t"
-    "pmaxsh     $f26, $f26, $f2                           \n\t"
-    "gslqc1     $f2, $f0, 432-400(%[tmp])                 \n\t"
-    "pminsh     $f28, $f4, $f24                           \n\t"
-    "pminsh     $f30, $f6, $f26                           \n\t"
-    "gslqc1     $f26, $f24, 432-320(%[tmp])               \n\t"
-    "and        $f28, $f28, $f24                          \n\t"
-    "and        $f30, $f30, $f26                          \n\t"
-    "dmfc1      %[iAlpha], $f24                           \n\t"
-    "dmfc1      %[iBeta], $f26                            \n\t"
-    "gslqc1     $f26, $f24, 432-288(%[tmp])               \n\t"
-    "and        $f28, $f28, $f24                          \n\t"
-    "and        $f30, $f30, $f26                          \n\t"
-    "paddh      $f20, $f20, $f28                          \n\t"
-    "paddh      $f22, $f22, $f30                          \n\t"
-    "packushb   $f8, $f8, $f10                            \n\t"
-    "packushb   $f10, $f20, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 432-272(%[tmp])               \n\t"
-    "paddh      $f0, $f0, $f20                            \n\t"
-    "paddh      $f2, $f2, $f22                            \n\t"
-    "paddh      $f12, $f12, $f16                          \n\t"
-    "paddh      $f14, $f14, $f18                          \n\t"
-    "packushb   $f12, $f12, $f14                          \n\t"
-    "packushb   $f14, $f0, $f2                            \n\t"
-
-    "gslqc1     $f2, $f0, 432-32(%[tmp])                  \n\t"
-    "psubh      $f0, $f0, $f16                            \n\t"
-    "psubh      $f2, $f2, $f18                            \n\t"
-    "gslqc1     $f18, $f16, 432-80(%[tmp])                \n\t"
-    "psubh      $f16, $f16, $f20                          \n\t"
-    "gslqc1     $f26, $f24, 432-48(%[tmp])                \n\t"
-    "psubh      $f18, $f18, $f22                          \n\t"
-
-    "gslqc1     $f22, $f20, 432-240(%[tmp])               \n\t"
-    "paddh      $f20, $f20, $f24                          \n\t"
-    "paddh      $f22, $f22, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 432-304(%[tmp])               \n\t"
-    "packushb   $f0, $f0, $f2                             \n\t"
-    "packushb   $f2, $f16, $f18                           \n\t"
-    "gslqc1     $f18, $f16, 432-384(%[tmp])               \n\t"
-    "paddh      $f16, $f16, $f24                          \n\t"
-    "paddh      $f18, $f18, $f26                          \n\t"
-    "gssqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
-    "gslqc1     $f2, $f0, 432-352(%[tmp])                 \n\t"
-    "mov.d      $f28, $f0                                 \n\t"
-    "mov.d      $f30, $f2                                 \n\t"
-    "paddh      $f0, $f0, $f0                             \n\t"
-    "paddh      $f2, $f2, $f2                             \n\t"
-
-    "dmtc1      %[iAlpha], $f24                           \n\t"
-    "dmtc1      %[iBeta], $f26                            \n\t"
-
-    "psubh      $f16, $f16, $f0                           \n\t"
-    "psubh      $f18, $f18, $f2                           \n\t"
-    "dli        $11, 0x1                                  \n\t"
-    "gslqc1     $f2, $f0, 432-336(%[tmp])                 \n\t"
-    "gssqc1     $f10, $f8, 0x0($9)                        \n\t"
-    "dmtc1      $11, $f8                                  \n\t"
-    "psrah      $f16, $f16, $f8                           \n\t"
-    "psrah      $f18, $f18, $f8                           \n\t"
-    "pmaxsh     $f0, $f0, $f16                            \n\t"
-    "pmaxsh     $f2, $f2, $f18                            \n\t"
-    "pminsh     $f4, $f4, $f0                             \n\t"
-    "pminsh     $f6, $f6, $f2                             \n\t"
-    "gslqc1     $f2, $f0, 480-208(%[tmp])                 \n\t"
-
-    "gslqc1     $f10, $f8, 428-256+4(%[tmp])              \n\t"
-    "and        $f4, $f4, $f24                            \n\t"
-    "and        $f6, $f6, $f26                            \n\t"
-    "and        $f4, $f4, $f8                             \n\t"
-    "and        $f6, $f6, $f10                            \n\t"
-    "gssqc1     $f14, $f12, 0x0($13)                      \n\t"
-    "paddh      $f28, $f28, $f4                           \n\t"
-    "paddh      $f30, $f30, $f6                           \n\t"
-    "packushb   $f20, $f20, $f22                          \n\t"
-    "packushb   $f22, $f28, $f30                          \n\t"
-    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
-    "gssqc1     $f22, $f20, 0x0(%[iStride])               \n\t"
-    : [pPix]"+&r"((unsigned char *)pPix)
-    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta),
-      [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp)
-    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
-      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
-      "$f22", "$f24", "$f26", "$f28", "$f30"
-  );
-  RECOVER_REG;
-}
-/*!
- * \brief Gather a 16-row x 8-byte block from pPixY and write it, after the
- *        MMI_TransTwo8x8B step, as 128 contiguous bytes at pDst.
- *
- * \param pPixY   address of the first source row. 8 bytes are read from each
- *                of the 16 rows pPixY + k * iStride (k = 0..15) with
- *                gsldlc1/gsldrc1 pairs, i.e. the left/right halves of an
- *                unaligned 64-bit load. The pointer is advanced inside the
- *                asm ("+&r" operand); it is a by-value parameter, so the
- *                caller's copy is unaffected.
- * \param iStride distance in bytes between consecutive source rows. It is
- *                passed as an int and used with the 64-bit dsll/daddu
- *                instructions. NOTE(review): this relies on the register
- *                holding the value sign-extended to 64 bits - confirm for
- *                the target ABI.
- * \param pDst    destination buffer; eight 16-byte gssqc1 stores cover
- *                offsets 0x00..0x7f. NOTE(review): gssqc1 is a 128-bit
- *                store, so pDst presumably has to be 16-byte aligned -
- *                confirm against the Loongson manual and the callers.
- *
- * BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
- * save and restore FP registers that the asm overwrites - TODO confirm.
- */
-void DeblockLumaTransposeH2V_mmi(uint8_t *pPixY, int32_t iStride,
-                                 uint8_t *pDst) {
-  BACKUP_REG;
-  __asm__ volatile(
-    ".set       arch=loongson3a                           \n\t"
-    "dsll       $8, %[iStride], 0x3                       \n\t"
-    "daddu      $8, $8, %[pPixY]                          \n\t"
-    /* $8 = pPixY + 8 * iStride (row 8). Rows 0, 8, 1, 9 -> $f0, $f2, $f4, $f6. */
-    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
-    "daddu      $10, $8, %[iStride]                       \n\t"
-    "gsldlc1    $f0, 0x7(%[pPixY])                        \n\t"
-    "gsldlc1    $f2, 0x7($8)                              \n\t"
-    "gsldlc1    $f4, 0x7($9)                              \n\t"
-    "gsldlc1    $f6, 0x7($10)                             \n\t"
-    "gsldrc1    $f0, 0x0(%[pPixY])                        \n\t"
-    "gsldrc1    $f2, 0x0($8)                              \n\t"
-    "gsldrc1    $f4, 0x0($9)                              \n\t"
-    "gsldrc1    $f6, 0x0($10)                             \n\t"
-    "daddu      %[pPixY], $9, %[iStride]                  \n\t" /* rows 2, 10, 3, 11 -> $f8..$f14 */
-    "daddu      $8, $10, %[iStride]                       \n\t"
-    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
-    "daddu      $10, $8, %[iStride]                       \n\t"
-    "gsldlc1    $f8, 0x7(%[pPixY])                        \n\t"
-    "gsldlc1    $f10, 0x7($8)                             \n\t"
-    "gsldlc1    $f12, 0x7($9)                             \n\t"
-    "gsldlc1    $f14, 0x7($10)                            \n\t"
-    "gsldrc1    $f8, 0x0(%[pPixY])                        \n\t"
-    "gsldrc1    $f10, 0x0($8)                             \n\t"
-    "gsldrc1    $f12, 0x0($9)                             \n\t"
-    "gsldrc1    $f14, 0x0($10)                            \n\t"
-    /* Rows 4, 12, 5, 13 -> $f16, $f18, $f20, $f22. */
-    "daddu      %[pPixY], $9, %[iStride]                  \n\t"
-    "daddu      $8, $10, %[iStride]                       \n\t"
-    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
-    "daddu      $10, $8, %[iStride]                       \n\t"
-    "gsldlc1    $f16, 0x7(%[pPixY])                       \n\t"
-    "gsldlc1    $f18, 0x7($8)                             \n\t"
-    "gsldlc1    $f20, 0x7($9)                             \n\t"
-    "gsldlc1    $f22, 0x7($10)                            \n\t"
-    "gsldrc1    $f16, 0x0(%[pPixY])                       \n\t"
-    "gsldrc1    $f18, 0x0($8)                             \n\t"
-    "gsldrc1    $f20, 0x0($9)                             \n\t"
-    "gsldrc1    $f22, 0x0($10)                            \n\t"
-    "daddu      %[pPixY], $9, %[iStride]                  \n\t" /* rows 6, 14, 7, 15 -> $f24..$f30 */
-    "daddu      $8, $10, %[iStride]                       \n\t"
-    "daddu      $9, %[pPixY], %[iStride]                  \n\t"
-    "daddu      $10, $8, %[iStride]                       \n\t"
-    "gsldlc1    $f24, 0x7(%[pPixY])                       \n\t"
-    "gsldlc1    $f26, 0x7($8)                             \n\t"
-
-    "gsldlc1    $f28, 0x7($9)                             \n\t"
-    "gsldlc1    $f30, 0x7($10)                            \n\t"
-    "gsldrc1    $f24, 0x0(%[pPixY])                       \n\t"
-    "gsldrc1    $f26, 0x0($8)                             \n\t"
-    "gsldrc1    $f28, 0x0($9)                             \n\t"
-    "gsldrc1    $f30, 0x0($10)                            \n\t"
-    /* Here $f0, $f4, ..., $f28 hold rows 0..7 and $f2, $f6, ..., $f30 hold
-     * rows 8..15. MMI_TransTwo8x8B is defined outside this chunk; $9 and $10
-     * are handed to it as extra operands, presumably as scratch GPRs - TODO
-     * confirm in the macro definition. */
-    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
-                     $f14, $f16, $f18, $f20, $f22, $f24,
-                     $f26, $f28, $f30, $9, $10)
-    /* Store the eight 16-byte results at pDst + 0x00 .. pDst + 0x70. */
-    "gssqc1     $f18, $f16, 0x0(%[pDst])                  \n\t"
-    "gssqc1     $f10, $f8, 0x10(%[pDst])                  \n\t"
-    "gssqc1     $f14, $f12, 0x20(%[pDst])                 \n\t"
-    "gssqc1     $f30, $f28, 0x30(%[pDst])                 \n\t"
-    "gssqc1     $f22, $f20, 0x40(%[pDst])                 \n\t"
-    "gssqc1     $f6, $f4, 0x50(%[pDst])                   \n\t"
-    "gssqc1     $f26, $f24, 0x60(%[pDst])                 \n\t"
-    "gssqc1     $f2, $f0, 0x70(%[pDst])                   \n\t"
-    : [pPixY] "+&r"((unsigned char *)pPixY) /* read-write: advanced row by row above */
-    : [iStride] "r"((int)iStride), [pDst] "r"((unsigned char *)pDst)
-    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
-      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
-      "$f30"
-  );
-  RECOVER_REG;
-}
-/*!
- * \brief Load 128 contiguous bytes from pSrc, run the MMI_TransTwo8x8B step,
- *        and scatter the result as 16 rows of 8 bytes starting at pPixY.
- *
- * \param pPixY   address of the first destination row. 8 bytes are written
- *                to each of the 16 rows pPixY + k * iStride (k = 0..15) with
- *                gssdlc1/gssdrc1 pairs, i.e. the left/right halves of an
- *                unaligned 64-bit store. The pointer is advanced inside the
- *                asm ("+&r" operand); it is a by-value parameter, so the
- *                caller's copy is unaffected.
- * \param iStride distance in bytes between consecutive destination rows. It
- *                is passed as an int and used with the 64-bit daddu
- *                instruction. NOTE(review): this relies on the register
- *                holding the value sign-extended to 64 bits - confirm for
- *                the target ABI.
- * \param pSrc    source buffer; eight 16-byte gslqc1 loads read offsets
- *                0x00..0x7f. NOTE(review): gslqc1 is a 128-bit load, so pSrc
- *                presumably has to be 16-byte aligned - confirm against the
- *                Loongson manual and the callers.
- *
- * BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
- * save and restore FP registers that the asm overwrites - TODO confirm.
- */
-void DeblockLumaTransposeV2H_mmi(uint8_t *pPixY, int32_t iStride,
-                                 uint8_t *pSrc) {
-  BACKUP_REG;
-  __asm__ volatile(
-    ".set       arch=loongson3a                           \n\t"
-    "gslqc1     $f2, $f0, 0x0(%[pSrc])                    \n\t"
-    "gslqc1     $f6, $f4, 0x10(%[pSrc])                   \n\t"
-    "gslqc1     $f10, $f8, 0x20(%[pSrc])                  \n\t"
-    "gslqc1     $f14, $f12, 0x30(%[pSrc])                 \n\t"
-    "gslqc1     $f18, $f16, 0x40(%[pSrc])                 \n\t"
-    "gslqc1     $f22, $f20, 0x50(%[pSrc])                 \n\t"
-    "gslqc1     $f26, $f24, 0x60(%[pSrc])                 \n\t"
-    "gslqc1     $f30, $f28, 0x70(%[pSrc])                 \n\t"
-    /* MMI_TransTwo8x8B is defined outside this chunk; $9 and $10 are handed
-     * to it as extra operands, presumably as scratch GPRs - TODO confirm in
-     * the macro definition. */
-    MMI_TransTwo8x8B($f0, $f2, $f4, $f6, $f8, $f10, $f12,
-                     $f14, $f16, $f18, $f20, $f22, $f24,
-                     $f26, $f28, $f30, $9, $10)
-    /* Two rows per step (pPixY and $8 = pPixY + iStride). Rows 0..7 are
-     * written from $f16, $f8, $f12, $f28, $f20, $f4, $f24, $f0. */
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f16, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f8, 0x7($8)                              \n\t"
-    "gssdrc1    $f16, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f8, 0x0($8)                              \n\t"
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f12, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f28, 0x7($8)                             \n\t"
-    "gssdrc1    $f12, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f28, 0x0($8)                             \n\t"
-
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f20, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f4, 0x7($8)                              \n\t"
-    "gssdrc1    $f20, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f4, 0x0($8)                              \n\t"
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f24, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f0, 0x7($8)                              \n\t"
-    "gssdrc1    $f24, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f0, 0x0($8)                              \n\t"
-    /* Rows 8..15 are written from $f18, $f10, $f14, $f30, $f22, $f6, $f26, $f2. */
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f18, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f10, 0x7($8)                             \n\t"
-    "gssdrc1    $f18, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f10, 0x0($8)                             \n\t"
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f14, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f30, 0x7($8)                             \n\t"
-    "gssdrc1    $f14, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f30, 0x0($8)                             \n\t"
-
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f22, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f6, 0x7($8)                              \n\t"
-    "gssdrc1    $f22, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f6, 0x0($8)                              \n\t"
-    "daddu      %[pPixY], $8, %[iStride]                  \n\t"
-    "daddu      $8, %[pPixY], %[iStride]                  \n\t"
-    "gssdlc1    $f26, 0x7(%[pPixY])                       \n\t"
-    "gssdlc1    $f2, 0x7($8)                              \n\t"
-    "gssdrc1    $f26, 0x0(%[pPixY])                       \n\t"
-    "gssdrc1    $f2, 0x0($8)                              \n\t"
-    : [pPixY] "+&r"((unsigned char *)pPixY) /* read-write: advanced row by row above */
-    : [iStride] "r"((int)iStride), [pSrc] "r"((unsigned char *)pSrc)
-    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
-      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28",
-      "$f30"
-  );
-  RECOVER_REG;
-}
-
-void DeblockLumaEq4V_mmi(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
-                         int32_t iBeta) {
-  unsigned char tmp[720] __attribute__((aligned(32)));
-  BACKUP_REG;
-  __asm__ volatile (
-    ".set       arch=loongson3a                           \n\t"
-    "dsll       $11, %[iStride], 0x2                      \n\t"
-    "xor        $f8, $f8, $f8                             \n\t"
-    "daddu      $14, %[iStride], %[pPix]                  \n\t"
-    "dsubu      $8, %[pPix], $11                          \n\t"
-    "gslqc1     $f14, $f12, 0x0($8)                       \n\t"
-    "gslqc1     $f22, $f20, 0x0(%[pPix])                  \n\t"
-    "daddu      $9, %[iStride], %[iStride]                \n\t"
-    "daddu      $10, $9, %[iStride]                       \n\t"
-    "move       $12, $9                                   \n\t"
-    "dsubu      $8, %[pPix], $9                           \n\t"
-    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
-    "dsubu      $9, %[pPix], %[iStride]                   \n\t"
-    "gslqc1     $f18, $f16, 0x0($9)                       \n\t"
-    "daddu      $13, %[iStride], %[pPix]                  \n\t"
-
-    "move       %[iStride], $12                           \n\t"
-    "daddu      $15, $12, %[pPix]                         \n\t"
-
-    "daddu      $12, %[pPix], $10                         \n\t"
-    "dsubu      $11, %[pPix], $10                         \n\t"
-
-    "gslqc1     $f26, $f24, 0x0($11)                      \n\t"
-    "daddu      %[iStride], %[iStride], %[pPix]           \n\t"
-    "dmtc1      %[iAlpha], $f0                            \n\t"
-
-    "punpcklhw  $f28, $f0, $f0                            \n\t"
-    "punpcklwd  $f0, $f28, $f28                           \n\t"
-    "mov.d      $f2, $f0                                  \n\t"
-    "gssqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
-    "dmtc1      %[iBeta], $f0                             \n\t"
-    "gsldxc1    $f10, 0x0($15, $0)                        \n\t"
-    "punpcklhw  $f28, $f0, $f0                            \n\t"
-    "punpcklwd  $f0, $f28, $f28                           \n\t"
-    "punpckhbh  $f30, $f10, $f8                           \n\t"
-    "mov.d      $f2, $f0                                  \n\t"
-
-    "punpcklbh  $f28, $f10, $f8                           \n\t"
-    "gssqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
-    "gssqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
-    "mov.d      $f0, $f4                                  \n\t"
-    "gssqc1     $f22, $f20, 704-272(%[tmp])               \n\t"
-    "gssqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
-    "mov.d      $f4, $f16                                 \n\t"
-    "punpckhbh  $f22, $f20, $f8                           \n\t"
-    "punpcklbh  $f20, $f20, $f8                           \n\t"
-    "punpckhbh  $f6, $f4, $f8                             \n\t"
-    "punpcklbh  $f4, $f4, $f8                             \n\t"
-
-    "psubh      $f28, $f20, $f4                           \n\t"
-    "psubh      $f30, $f22, $f6                           \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f2, $f10)
-    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
-    "punpckhbh  $f2, $f0, $f8                             \n\t"
-    "punpcklbh  $f0, $f0, $f8                             \n\t"
-    "gssqc1     $f18, $f16, 688-272(%[tmp])               \n\t"
-    "gslqc1     $f18, $f16, 0x0($14)                      \n\t"
-    "gssqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
-
-    "psubh      $f28, $f4, $f0                            \n\t"
-    "psubh      $f30, $f6, $f2                            \n\t"
-
-    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f18, $f10)
-    "punpckhbh  $f18, $f16, $f8                           \n\t"
-    "punpcklbh  $f16, $f16, $f8                           \n\t"
-    "pcmpgth    $f0, $f0, $f28                            \n\t"
-    "pcmpgth    $f2, $f2, $f30                            \n\t"
-    "gssqc1     $f18, $f16, 640-384(%[tmp])               \n\t"
-    "psubh      $f28, $f20, $f16                          \n\t"
-    "psubh      $f30, $f22, $f18                          \n\t"
-    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
-    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
-    "punpckhbh  $f26, $f24, $f8                           \n\t"
-    "punpcklbh  $f24, $f24, $f8                           \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "gssqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
-    "gssqc1     $f6, $f4, 640-144(%[tmp])                 \n\t"
-    "gssqc1     $f22, $f20, 640-400(%[tmp])               \n\t"
-    "pcmpgth    $f16, $f16, $f28                          \n\t"
-    "pcmpgth    $f18, $f18, $f30                          \n\t"
-    "and        $f0, $f0, $f16                            \n\t"
-    "and        $f2, $f2, $f18                            \n\t"
-    "gslqc1     $f18, $f16, 640-320(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
-    "dli        %[iAlpha], 0x2                            \n\t"
-    "dli        %[iBeta], 0x2                             \n\t"
-    "pcmpgth    $f16, $f16, $f28                          \n\t"
-    "pcmpgth    $f18, $f18, $f30                          \n\t"
-    "and        $f0, $f0, $f16                            \n\t"
-    "and        $f2, $f2, $f18                            \n\t"
-    "dmtc1      %[iAlpha], $f16                           \n\t"
-    "dmtc1      %[iBeta], $f10                            \n\t"
-    "gssqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
-    "gslqc1     $f2, $f0, 640-320(%[tmp])                 \n\t"
-
-    "punpcklhw  $f28, $f16, $f16                          \n\t"
-    "psrah      $f16, $f0, $f10                           \n\t"
-    "psrah      $f18, $f2, $f10                           \n\t"
-    "punpcklwd  $f28, $f28, $f28                          \n\t"
-    "mov.d      $f30, $f28                                \n\t"
-    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
-    "paddh      $f16, $f16, $f28                          \n\t"
-    "paddh      $f18, $f18, $f30                          \n\t"
-    "gssqc1     $f18, $f16, 640-576(%[tmp])               \n\t"
-    "pcmpgth    $f16, $f16, $f8                           \n\t"
-    "pcmpgth    $f18, $f18, $f10                          \n\t"
-    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
-
-    "gssqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
-    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
-    "psubh      $f28, $f4, $f24                           \n\t"
-    "psubh      $f30, $f6, $f26                           \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
-    "pcmpgth    $f16, $f16, $f28                          \n\t"
-    "pcmpgth    $f18, $f18, $f30                          \n\t"
-
-    "gslqc1     $f2, $f0, 640-416(%[tmp])                 \n\t"
-    "and        $f16, $f16, $f8                           \n\t"
-    "and        $f18, $f18, $f10                          \n\t"
-    "gssqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
-    "gslqc1     $f18, $f16, 640-512(%[tmp])               \n\t"
-    "psubh      $f28, $f20, $f0                           \n\t"
-    "psubh      $f30, $f22, $f2                           \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
-    "pcmpgth    $f16, $f16, $f28                          \n\t"
-    "pcmpgth    $f18, $f18, $f30                          \n\t"
-
-    "and        $f16, $f16, $f8                           \n\t"
-    "and        $f18, $f18, $f10                          \n\t"
-    "gssqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
-
-    "gslqc1     $f18, $f16, 640-544(%[tmp])               \n\t"
-    "xor        $f8, $f8, $f8                             \n\t"
-    "pandn      $f16, $f16, $f24                          \n\t"
-    "dli        %[iAlpha], 0x4                            \n\t"
-    "pandn      $f18, $f18, $f26                          \n\t"
-    "gssqc1     $f18, $f16, 640-16(%[tmp])                \n\t"
-    "dmtc1      %[iAlpha], $f16                           \n\t"
-    "punpcklhw  $f28, $f16, $f16                          \n\t"
-    "dli        %[iAlpha], 0x1                            \n\t"
-    "punpckhbh  $f18, $f12, $f8                           \n\t"
-    "dmtc1      %[iAlpha], $f30                           \n\t"
-    "punpcklbh  $f16, $f12, $f8                           \n\t"
-    "psllh      $f16, $f16, $f30                          \n\t"
-    "psllh      $f18, $f18, $f30                          \n\t"
-    "paddh      $f16, $f16, $f24                          \n\t"
-    "paddh      $f18, $f18, $f26                          \n\t"
-    "gslqc1     $f2, $f0, 640-480(%[tmp])                 \n\t"
-    "paddh      $f16, $f16, $f24                          \n\t"
-    "paddh      $f18, $f18, $f26                          \n\t"
-    "paddh      $f16, $f16, $f24                          \n\t"
-    "paddh      $f18, $f18, $f26                          \n\t"
-    "paddh      $f16, $f16, $f0                           \n\t"
-    "paddh      $f18, $f18, $f2                           \n\t"
-
-    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
-    "punpcklwd  $f28, $f28, $f28                          \n\t"
-    "mov.d      $f30, $f28                                \n\t"
-    "paddh      $f16, $f16, $f4                           \n\t"
-    "paddh      $f18, $f18, $f6                           \n\t"
-    "gssqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
-    "paddh      $f16, $f16, $f20                          \n\t"
-    "paddh      $f18, $f18, $f22                          \n\t"
-    "paddh      $f16, $f16, $f28                          \n\t"
-    "paddh      $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 640-384(%[tmp])                 \n\t"
-    "pandn      $f24, $f24, $f28                          \n\t"
-    "pandn      $f26, $f26, $f30                          \n\t"
-    "gssqc1     $f26, $f24, 640-80(%[tmp])                \n\t"
-    "gslqc1     $f26, $f24, 0x0($12)                      \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "punpckhbh  $f26, $f24, $f8                           \n\t"
-    "punpcklbh  $f24, $f24, $f8                           \n\t"
-    "psllh      $f24, $f24, $f10                          \n\t"
-    "psllh      $f26, $f26, $f10                          \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "paddh      $f24, $f24, $f0                           \n\t"
-    "paddh      $f26, $f26, $f2                           \n\t"
-
-    "dli        %[iAlpha], 0x3                            \n\t"
-    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
-    "paddh      $f24, $f24, $f20                          \n\t"
-    "paddh      $f26, $f26, $f22                          \n\t"
-    "paddh      $f24, $f24, $f4                           \n\t"
-    "paddh      $f26, $f26, $f6                           \n\t"
-    "paddh      $f24, $f24, $f0                           \n\t"
-    "paddh      $f26, $f26, $f2                           \n\t"
-    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "psrah      $f24, $f24, $f10                          \n\t"
-    "psrah      $f26, $f26, $f10                          \n\t"
-    "and        $f24, $f24, $f0                           \n\t"
-    "and        $f26, $f26, $f2                           \n\t"
-    "gssqc1     $f26, $f24, 640-112(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
-    "pandn      $f24, $f24, $f28                          \n\t"
-    "pandn      $f26, $f26, $f30                          \n\t"
-    "gssqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
-    "gssqc1     $f26, $f24, 640-528(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-368(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 640-544(%[tmp])                 \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "psrah      $f16, $f16, $f10                          \n\t"
-    "psrah      $f18, $f18, $f10                          \n\t"
-    "and        $f16, $f16, $f0                           \n\t"
-    "and        $f18, $f18, $f2                           \n\t"
-    "gslqc1     $f2, $f0, 640-624(%[tmp])                 \n\t"
-    "paddh      $f28, $f4, $f20                           \n\t"
-    "paddh      $f30, $f6, $f22                           \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "paddh      $f24, $f24, $f0                           \n\t"
-    "paddh      $f26, $f26, $f2                           \n\t"
-    "gslqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
-    "dli        %[iAlpha], 0x2                            \n\t"
-
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "paddh      $f20, $f20, $f4                           \n\t"
-    "paddh      $f22, $f22, $f6                           \n\t"
-    "psrah      $f24, $f24, $f10                          \n\t"
-    "psrah      $f26, $f26, $f10                          \n\t"
-    "and        $f28, $f28, $f24                          \n\t"
-    "and        $f30, $f30, $f26                          \n\t"
-
-    "gslqc1     $f26, $f24, 640-384(%[tmp])               \n\t"
-    "gssqc1     $f30, $f28, 640-64(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
-    "pandn      $f28, $f28, $f24                          \n\t"
-    "pandn      $f30, $f30, $f26                          \n\t"
-    "gssqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
-    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
-    "paddh      $f28, $f28, $f24                          \n\t"
-    "paddh      $f30, $f30, $f26                          \n\t"
-    "paddh      $f28, $f28, $f20                          \n\t"
-    "paddh      $f30, $f30, $f22                          \n\t"
-    "paddh      $f28, $f28, $f8                           \n\t"
-    "paddh      $f30, $f30, $f10                          \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "gslqc1     $f22, $f20, 640-560(%[tmp])               \n\t"
-    "psrah      $f28, $f28, $f10                          \n\t"
-    "psrah      $f30, $f30, $f10                          \n\t"
-    "and        $f20, $f20, $f28                          \n\t"
-    "and        $f22, $f22, $f30                          \n\t"
-    "gssqc1     $f22, $f20, 640-32(%[tmp])                \n\t"
-
-    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
-    "gslqc1     $f2, $f0, 640-592(%[tmp])                 \n\t"
-    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
-    "paddh      $f28, $f20, $f20                          \n\t"
-    "paddh      $f30, $f22, $f22                          \n\t"
-    "paddh      $f20, $f4, $f24                           \n\t"
-    "paddh      $f22, $f6, $f26                           \n\t"
-    "paddh      $f24, $f24, $f0                           \n\t"
-    "paddh      $f26, $f26, $f2                           \n\t"
-    "paddh      $f28, $f28, $f20                          \n\t"
-    "paddh      $f30, $f30, $f22                          \n\t"
-    "paddh      $f28, $f28, $f8                           \n\t"
-    "paddh      $f30, $f30, $f10                          \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "gslqc1     $f22, $f20, 640-544(%[tmp])               \n\t"
-    "psrah      $f28, $f28, $f10                          \n\t"
-    "psrah      $f30, $f30, $f10                          \n\t"
-    "dli        %[iAlpha], 0x1                            \n\t"
-    "pandn      $f20, $f20, $f28                          \n\t"
-    "pandn      $f22, $f22, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 640-480(%[tmp])               \n\t"
-    "paddh      $f28, $f28, $f4                           \n\t"
-    "paddh      $f30, $f30, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 640-400(%[tmp])                 \n\t"
-    "paddh      $f28, $f28, $f4                           \n\t"
-    "paddh      $f30, $f30, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 640-544(%[tmp])                 \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "gssqc1     $f22, $f20, 640-352(%[tmp])               \n\t"
-    "gslqc1     $f22, $f20, 640-368(%[tmp])               \n\t"
-    "psllh      $f28, $f28, $f10                          \n\t"
-    "psllh      $f30, $f30, $f10                          \n\t"
-    "dli        %[iAlpha], 0x3                            \n\t"
-    "paddh      $f28, $f28, $f24                          \n\t"
-    "paddh      $f30, $f30, $f26                          \n\t"
-    "paddh      $f20, $f20, $f28                          \n\t"
-    "paddh      $f22, $f22, $f30                          \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-
-    "dli        %[iAlpha], 0x2                            \n\t"
-    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
-    "psrah      $f20, $f20, $f10                          \n\t"
-    "psrah      $f22, $f22, $f10                          \n\t"
-    "and        $f4, $f4, $f20                            \n\t"
-    "and        $f6, $f6, $f22                            \n\t"
-    "gslqc1     $f22, $f20, 640-480(%[tmp])               \n\t"
-    "gssqc1     $f6, $f4, 640-96(%[tmp])                  \n\t"
-    "gslqc1     $f6, $f4, 640-384(%[tmp])                 \n\t"
-    "gslqc1     $f10, $f8, 640-400(%[tmp])                \n\t"
-    "paddh      $f24, $f4, $f4                            \n\t"
-    "paddh      $f26, $f6, $f6                            \n\t"
-    "paddh      $f4, $f4, $f8                             \n\t"
-    "paddh      $f6, $f6, $f10                            \n\t"
-    "gslqc1     $f10, $f8, 640-144(%[tmp])                \n\t"
-    "paddh      $f28, $f28, $f20                          \n\t"
-    "paddh      $f30, $f30, $f22                          \n\t"
-    "paddh      $f4, $f4, $f8                             \n\t"
-    "paddh      $f6, $f6, $f10                            \n\t"
-    "gslqc1     $f10, $f8, 640-592(%[tmp])                \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "paddh      $f20, $f20, $f8                           \n\t"
-    "paddh      $f22, $f22, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-624(%[tmp])                \n\t"
-    "paddh      $f24, $f24, $f8                           \n\t"
-    "dmtc1      %[iAlpha], $f8                            \n\t"
-    "paddh      $f26, $f26, $f10                          \n\t"
-    "dli        %[iAlpha], 0x1                            \n\t"
-    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "psrah      $f24, $f24, $f8                           \n\t"
-    "psrah      $f26, $f26, $f8                           \n\t"
-    "psllh      $f4, $f4, $f10                            \n\t"
-    "psllh      $f6, $f6, $f10                            \n\t"
-    "paddh      $f4, $f4, $f20                            \n\t"
-    "paddh      $f6, $f6, $f22                            \n\t"
-    "dli        %[iAlpha], 0x3                            \n\t"
-
-    "gslqc1     $f22, $f20, 656-272(%[tmp])               \n\t"
-    "pandn      $f28, $f28, $f24                          \n\t"
-    "pandn      $f30, $f30, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 640-416(%[tmp])               \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-    "paddh      $f24, $f24, $f4                           \n\t"
-    "paddh      $f26, $f26, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 640-560(%[tmp])                 \n\t"
-    "psrah      $f24, $f24, $f10                          \n\t"
-    "psrah      $f26, $f26, $f10                          \n\t"
-    "and        $f4, $f4, $f24                            \n\t"
-    "and        $f6, $f6, $f26                            \n\t"
-
-    "xor        $f8, $f8, $f8                             \n\t"
-    "gslqc1     $f26, $f24, 704-272(%[tmp])               \n\t"
-    "gssqc1     $f6, $f4, 640-128(%[tmp])                 \n\t"
-    "gslqc1     $f6, $f4, 672-272(%[tmp])                 \n\t"
-    "punpcklbh  $f4, $f6, $f8                             \n\t"
-    "punpckhbh  $f6, $f6, $f8                             \n\t"
-    "gssqc1     $f6, $f4, 640-448(%[tmp])                 \n\t"
-    "gslqc1     $f6, $f4, 688-272(%[tmp])                 \n\t"
-    "punpcklbh  $f4, $f6, $f8                             \n\t"
-    "punpckhbh  $f6, $f6, $f8                             \n\t"
-    "punpcklbh  $f24, $f26, $f8                           \n\t"
-    "punpckhbh  $f26, $f26, $f8                           \n\t"
-    "gssqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
-    "punpcklbh  $f20, $f22, $f8                           \n\t"
-    "punpckhbh  $f22, $f22, $f8                           \n\t"
-    "gslqc1     $f30, $f28, 0x0($14)                      \n\t"
-    "gssqc1     $f6, $f4, 640-496(%[tmp])                 \n\t"
-    "gssqc1     $f26, $f24, 640-432(%[tmp])               \n\t"
-
-    "gsldxc1    $f0, 0x8($15, $0)                         \n\t"
-    "punpcklbh  $f28, $f30, $f8                           \n\t"
-    "punpckhbh  $f30, $f30, $f8                           \n\t"
-    "gssqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
-
-    "punpcklbh  $f28, $f0, $f8                            \n\t"
-    "punpckhbh  $f30, $f0, $f8                            \n\t"
-    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
-    "gssqc1     $f30, $f28, 640-528(%[tmp])               \n\t"
-
-    "psubh      $f28, $f24, $f4                           \n\t"
-    "psubh      $f30, $f26, $f6                           \n\t"
-    "psubh      $f24, $f24, $f8                           \n\t"
-    "psubh      $f26, $f26, $f10                          \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "gslqc1     $f10, $f8, 640-16(%[tmp])                 \n\t"
-    "gssqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
-    "or         $f16, $f16, $f8                           \n\t"
-    "or         $f18, $f18, $f10                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
-    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
-    "psubh      $f28, $f4, $f28                           \n\t"
-    "psubh      $f30, $f6, $f30                           \n\t"
-
-    "gslqc1     $f2, $f0, 640-512(%[tmp])                 \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "pcmpgth    $f4, $f0, $f28                            \n\t"
-    "pcmpgth    $f6, $f2, $f30                            \n\t"
-    "pcmpgth    $f28, $f0, $f24                           \n\t"
-    "pcmpgth    $f30, $f2, $f26                           \n\t"
-    "gslqc1     $f26, $f24, 640-320(%[tmp])               \n\t"
-    "and        $f4, $f4, $f28                            \n\t"
-    "and        $f6, $f6, $f30                            \n\t"
-    "gslqc1     $f30, $f28, 640-560(%[tmp])               \n\t"
-    "pcmpgth    $f24, $f24, $f28                          \n\t"
-    "pcmpgth    $f26, $f26, $f30                          \n\t"
-    "and        $f4, $f4, $f24                            \n\t"
-    "and        $f6, $f6, $f26                            \n\t"
-
-    "gslqc1     $f26, $f24, 640-576(%[tmp])               \n\t"
-    "pcmpgth    $f24, $f24, $f28                          \n\t"
-    "pcmpgth    $f26, $f26, $f30                          \n\t"
-    "xor        $f8, $f8, $f8                             \n\t"
-    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
-    "punpcklbh  $f12, $f14, $f8                           \n\t"
-    "punpckhbh  $f14, $f14, $f8                           \n\t"
-    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
-    "psubh      $f28, $f28, $f20                          \n\t"
-    "psubh      $f30, $f30, $f22                          \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "pcmpgth    $f24, $f24, $f28                          \n\t"
-    "pcmpgth    $f26, $f26, $f30                          \n\t"
-
-    "dli        %[iAlpha], 0x1                            \n\t"
-    "gslqc1     $f10, $f8, 640-560(%[tmp])                \n\t"
-    "and        $f24, $f24, $f8                           \n\t"
-    "and        $f26, $f26, $f10                          \n\t"
-    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
-    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
-    "psubh      $f28, $f28, $f8                           \n\t"
-    "psubh      $f30, $f30, $f10                          \n\t"
-    "dmtc1      %[iAlpha], $f10                           \n\t"
-
-    "psllh      $f12, $f12, $f10                          \n\t"
-    "psllh      $f14, $f14, $f10                          \n\t"
-    "gssqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-512(%[tmp])               \n\t"
-
-    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
-    "paddh      $f12, $f12, $f20                          \n\t"
-    "paddh      $f14, $f14, $f22                          \n\t"
-    "paddh      $f12, $f12, $f20                          \n\t"
-    "paddh      $f14, $f14, $f22                          \n\t"
-    "paddh      $f12, $f12, $f20                          \n\t"
-    "paddh      $f14, $f14, $f22                          \n\t"
-    "paddh      $f12, $f12, $f8                           \n\t"
-    "paddh      $f14, $f14, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
-    "gslqc1     $f2, $f0, 640-560(%[tmp])                 \n\t"
-    "paddh      $f12, $f12, $f8                           \n\t"
-    "paddh      $f14, $f14, $f10                          \n\t"
-    WELS_AbsH($f28, $f30, $f28, $f30, $f8, $f10)
-    "pcmpgth    $f24, $f24, $f28                          \n\t"
-    "pcmpgth    $f26, $f26, $f30                          \n\t"
-    "and        $f24, $f24, $f0                           \n\t"
-    "and        $f26, $f26, $f2                           \n\t"
-    "gssqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
-    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
-
-    "gslqc1     $f2, $f0, 736-272(%[tmp])                 \n\t"
-    "dli        %[iAlpha], 0x3                            \n\t"
-    "gslqc1     $f30, $f28, 640-368(%[tmp])               \n\t"
-    "and        $f24, $f0, $f16                           \n\t"
-    "and        $f26, $f2, $f18                           \n\t"
-    "pandn      $f16, $f0, $f28                           \n\t"
-    "pandn      $f18, $f2, $f30                           \n\t"
-    "or         $f24, $f24, $f16                          \n\t"
-    "or         $f26, $f26, $f18                          \n\t"
-    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
-    "paddh      $f12, $f12, $f16                          \n\t"
-    "paddh      $f14, $f14, $f18                          \n\t"
-    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
-    "paddh      $f12, $f12, $f28                          \n\t"
-    "paddh      $f14, $f14, $f30                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "psrah      $f12, $f12, $f28                          \n\t"
-    "psrah      $f14, $f14, $f28                          \n\t"
-    "and        $f12, $f12, $f8                           \n\t"
-    "and        $f14, $f14, $f10                          \n\t"
-    "pandn      $f8, $f8, $f20                            \n\t"
-    "pandn      $f10, $f10, $f22                          \n\t"
-    "or         $f12, $f12, $f8                           \n\t"
-    "or         $f14, $f14, $f10                          \n\t"
-    "and        $f28, $f4, $f12                           \n\t"
-    "and        $f30, $f6, $f14                           \n\t"
-    "gslqc1     $f14, $f12, 640-64(%[tmp])                \n\t"
-    "gslqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
-    "or         $f12, $f12, $f8                           \n\t"
-    "or         $f14, $f14, $f10                          \n\t"
-    "pandn      $f8, $f4, $f20                            \n\t"
-    "pandn      $f10, $f6, $f22                           \n\t"
-    "or         $f28, $f28, $f8                           \n\t"
-    "or         $f30, $f30, $f10                          \n\t"
-
-    "dli        %[iAlpha], 0x2                            \n\t"
-    "and        $f8, $f0, $f12                            \n\t"
-    "and        $f10, $f2, $f14                           \n\t"
-    "gslqc1     $f14, $f12, 640-480(%[tmp])               \n\t"
-    "pandn      $f12, $f0, $f12                           \n\t"
-    "pandn      $f14, $f2, $f14                           \n\t"
-    "or         $f8, $f8, $f12                            \n\t"
-    "or         $f10, $f10, $f14                          \n\t"
-    "packushb   $f24, $f24, $f26                          \n\t"
-    "packushb   $f26, $f28, $f30                          \n\t"
-    "gssqc1     $f10, $f8, 640-336(%[tmp])                \n\t"
-    "gssqc1     $f26, $f24, 656-272(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
-    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
-    "paddh      $f8, $f20, $f8                            \n\t"
-    "paddh      $f10, $f22, $f10                          \n\t"
-    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
-    "paddh      $f28, $f28, $f16                          \n\t"
-    "paddh      $f30, $f30, $f18                          \n\t"
-    "paddh      $f8, $f8, $f28                            \n\t"
-    "paddh      $f10, $f10, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
-    "paddh      $f8, $f8, $f28                            \n\t"
-    "paddh      $f10, $f10, $f30                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "psrah      $f8, $f8, $f28                            \n\t"
-    "psrah      $f10, $f10, $f28                          \n\t"
-    "dli        %[iAlpha], 0x1                            \n\t"
-    "gslqc1     $f30, $f28, 640-544(%[tmp])               \n\t"
-    "and        $f24, $f24, $f8                           \n\t"
-    "and        $f26, $f26, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-448(%[tmp])                \n\t"
-    "pandn      $f28, $f28, $f8                           \n\t"
-    "pandn      $f30, $f30, $f10                          \n\t"
-    "or         $f24, $f24, $f28                          \n\t"
-    "or         $f26, $f26, $f30                          \n\t"
-    "and        $f12, $f4, $f24                           \n\t"
-    "and        $f14, $f6, $f26                           \n\t"
-    "pandn      $f24, $f4, $f8                            \n\t"
-    "pandn      $f26, $f6, $f10                           \n\t"
-    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
-    "paddh      $f8, $f8, $f28                            \n\t"
-    "paddh      $f10, $f10, $f30                          \n\t"
-    "paddh      $f8, $f8, $f16                            \n\t"
-    "paddh      $f10, $f10, $f18                          \n\t"
-    "or         $f12, $f12, $f24                          \n\t"
-    "or         $f14, $f14, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 640-336(%[tmp])               \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "packushb   $f24, $f24, $f26                          \n\t"
-    "packushb   $f26, $f12, $f14                          \n\t"
-    "psllh      $f8, $f8, $f28                            \n\t"
-    "psllh      $f10, $f10, $f28                          \n\t"
-    "gssqc1     $f26, $f24, 672-272(%[tmp])               \n\t"
-    "gslqc1     $f26, $f24, 640-96(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-352(%[tmp])               \n\t"
-    "or         $f24, $f24, $f28                          \n\t"
-    "or         $f26, $f26, $f30                          \n\t"
-    "dli        %[iAlpha], 0x3                            \n\t"
-
-    "and        $f12, $f0, $f24                           \n\t"
-    "and        $f14, $f2, $f26                           \n\t"
-    "gslqc1     $f26, $f24, 640-144(%[tmp])               \n\t"
-    "pandn      $f24, $f0, $f24                           \n\t"
-    "pandn      $f26, $f2, $f26                           \n\t"
-    "or         $f12, $f12, $f24                          \n\t"
-    "or         $f14, $f14, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 640-544(%[tmp])               \n\t"
-    "gssqc1     $f14, $f12, 640-352(%[tmp])               \n\t"
-    "gslqc1     $f14, $f12, 640-464(%[tmp])               \n\t"
-    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
-    "paddh      $f12, $f12, $f28                          \n\t"
-    "paddh      $f14, $f14, $f30                          \n\t"
-    "paddh      $f8, $f8, $f12                            \n\t"
-    "paddh      $f10, $f10, $f14                          \n\t"
-    "gslqc1     $f14, $f12, 640-448(%[tmp])               \n\t"
-    "paddh      $f20, $f20, $f8                           \n\t"
-    "paddh      $f22, $f22, $f10                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
-    "psrah      $f20, $f20, $f28                          \n\t"
-    "psrah      $f22, $f22, $f28                          \n\t"
-    "and        $f24, $f24, $f20                          \n\t"
-    "and        $f26, $f26, $f22                          \n\t"
-    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
-    "paddh      $f8, $f8, $f20                            \n\t"
-    "paddh      $f10, $f10, $f22                          \n\t"
-    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
-    "dli        %[iAlpha], 0x2                            \n\t"
-    "paddh      $f20, $f20, $f28                          \n\t"
-    "paddh      $f22, $f22, $f30                          \n\t"
-    "paddh      $f16, $f12, $f12                          \n\t"
-    "paddh      $f18, $f14, $f14                          \n\t"
-    "paddh      $f16, $f16, $f8                           \n\t"
-    "paddh      $f18, $f18, $f10                          \n\t"
-    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
-    "paddh      $f16, $f16, $f28                          \n\t"
-    "paddh      $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f10, $f8, 640-544(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
-    "paddh      $f12, $f12, $f28                          \n\t"
-    "paddh      $f14, $f14, $f30                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "psrah      $f16, $f16, $f28                          \n\t"
-    "psrah      $f18, $f18, $f28                          \n\t"
-    "pandn      $f8, $f8, $f16                            \n\t"
-    "pandn      $f10, $f10, $f18                          \n\t"
-    "or         $f24, $f24, $f8                           \n\t"
-    "or         $f26, $f26, $f10                          \n\t"
-    "and        $f28, $f4, $f24                           \n\t"
-    "and        $f30, $f6, $f26                           \n\t"
-    "gslqc1     $f26, $f24, 640-496(%[tmp])               \n\t"
-    "pandn      $f8, $f4, $f24                            \n\t"
-    "pandn      $f10, $f6, $f26                           \n\t"
-    "or         $f28, $f28, $f8                           \n\t"
-    "or         $f30, $f30, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-352(%[tmp])                \n\t"
-    "packushb   $f8, $f8, $f10                            \n\t"
-    "packushb   $f10, $f28, $f30                          \n\t"
-    "gssqc1     $f10, $f8, 688-272(%[tmp])                \n\t"
-    "gslqc1     $f10, $f8, 640-128(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-288(%[tmp])               \n\t"
-    "or         $f8, $f8, $f28                            \n\t"
-    "or         $f10, $f10, $f30                          \n\t"
-    "dli        %[iAlpha], 0x1                            \n\t"
-
-    "and        $f16, $f0, $f8                            \n\t"
-    "and        $f18, $f2, $f10                           \n\t"
-    "paddh      $f20, $f20, $f24                          \n\t"
-    "paddh      $f22, $f22, $f26                          \n\t"
-    "gslqc1     $f30, $f28, 640-400(%[tmp])               \n\t"
-    "pandn      $f8, $f0, $f28                            \n\t"
-    "pandn      $f10, $f2, $f30                           \n\t"
-    "or         $f16, $f16, $f8                           \n\t"
-    "or         $f18, $f18, $f10                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
-    "dli        %[iAlpha], 0x3                            \n\t"
-    "psllh      $f20, $f20, $f28                          \n\t"
-    "psllh      $f22, $f22, $f28                          \n\t"
-    "paddh      $f20, $f20, $f12                          \n\t"
-    "paddh      $f22, $f22, $f14                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
-    "paddh      $f8, $f8, $f20                            \n\t"
-    "paddh      $f10, $f10, $f22                          \n\t"
-    "psrah      $f8, $f8, $f28                            \n\t"
-    "psrah      $f10, $f10, $f28                          \n\t"
-    "gssqc1     $f18, $f16, 640-288(%[tmp])               \n\t"
-    "gslqc1     $f18, $f16, 640-560(%[tmp])               \n\t"
-    "and        $f16, $f16, $f8                           \n\t"
-    "and        $f18, $f18, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-464(%[tmp])                \n\t"
-    "paddh      $f20, $f8, $f8                            \n\t"
-    "paddh      $f22, $f10, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-432(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-448(%[tmp])               \n\t"
-    "paddh      $f8, $f8, $f28                            \n\t"
-    "paddh      $f10, $f10, $f30                          \n\t"
-    "dli        %[iAlpha], 0x2                            \n\t"
-    "paddh      $f20, $f20, $f8                           \n\t"
-    "paddh      $f22, $f22, $f10                          \n\t"
-    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
-    "paddh      $f20, $f20, $f28                          \n\t"
-    "paddh      $f22, $f22, $f30                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "gslqc1     $f26, $f24, 640-560(%[tmp])               \n\t"
-    "psrah      $f20, $f20, $f28                          \n\t"
-    "psrah      $f22, $f22, $f28                          \n\t"
-    "pandn      $f12, $f12, $f20                          \n\t"
-    "pandn      $f14, $f14, $f22                          \n\t"
-    "or         $f16, $f16, $f12                          \n\t"
-    "or         $f18, $f18, $f14                          \n\t"
-    "gslqc1     $f14, $f12, 640-32(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-304(%[tmp])               \n\t"
-    "or         $f12, $f12, $f28                          \n\t"
-    "or         $f14, $f14, $f30                          \n\t"
-    "and        $f28, $f4, $f16                           \n\t"
-    "and        $f30, $f6, $f18                           \n\t"
-    "gslqc1     $f18, $f16, 640-432(%[tmp])               \n\t"
-    "gslqc1     $f22, $f20, 640-464(%[tmp])               \n\t"
-    "pandn      $f8, $f4, $f16                            \n\t"
-    "pandn      $f10, $f6, $f18                           \n\t"
-    "or         $f28, $f28, $f8                           \n\t"
-    "or         $f30, $f30, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-496(%[tmp])                \n\t"
-    "paddh      $f16, $f16, $f8                           \n\t"
-    "paddh      $f18, $f18, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 640-288(%[tmp])                \n\t"
-    "packushb   $f8, $f8, $f10                            \n\t"
-    "packushb   $f10, $f28, $f30                          \n\t"
-    "dli        %[iAlpha], 0x2                            \n\t"
-    "gssqc1     $f10, $f8, 704-272(%[tmp])                \n\t"
-
-    "and        $f8, $f0, $f12                            \n\t"
-    "and        $f10, $f2, $f14                           \n\t"
-    "gslqc1     $f30, $f28, 640-384(%[tmp])               \n\t"
-    "pandn      $f12, $f0, $f28                           \n\t"
-    "pandn      $f14, $f2, $f30                           \n\t"
-    "or         $f8, $f8, $f12                            \n\t"
-    "or         $f10, $f10, $f14                          \n\t"
-    "gssqc1     $f10, $f8, 640-304(%[tmp])                \n\t"
-    "gslqc1     $f10, $f8, 640-528(%[tmp])                \n\t"
-    "gslqc1     $f30, $f28, 640-464(%[tmp])               \n\t"
-    "paddh      $f12, $f8, $f28                           \n\t"
-    "paddh      $f14, $f10, $f30                          \n\t"
-    "paddh      $f12, $f12, $f16                          \n\t"
-    "paddh      $f14, $f14, $f18                          \n\t"
-    "gslqc1     $f30, $f28, 640-624(%[tmp])               \n\t"
-    "paddh      $f12, $f12, $f28                          \n\t"
-    "paddh      $f14, $f14, $f30                          \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "psrah      $f12, $f12, $f28                          \n\t"
-    "psrah      $f14, $f14, $f28                          \n\t"
-    "and        $f24, $f24, $f12                          \n\t"
-    "and        $f26, $f26, $f14                          \n\t"
-    "gslqc1     $f14, $f12, 640-560(%[tmp])               \n\t"
-    "pandn      $f16, $f12, $f20                          \n\t"
-    "pandn      $f18, $f14, $f22                          \n\t"
-    "or         $f24, $f24, $f16                          \n\t"
-    "or         $f26, $f26, $f18                          \n\t"
-    "and        $f28, $f4, $f24                           \n\t"
-    "and        $f30, $f6, $f26                           \n\t"
-    "gslqc1     $f26, $f24, 640-304(%[tmp])               \n\t"
-    "pandn      $f16, $f4, $f20                           \n\t"
-    "pandn      $f18, $f6, $f22                           \n\t"
-    "or         $f28, $f28, $f16                          \n\t"
-    "or         $f30, $f30, $f18                          \n\t"
-    "dli        %[iAlpha], 0x1                            \n\t"
-
-    "packushb   $f24, $f24, $f26                          \n\t"
-    "packushb   $f26, $f28, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 640-112(%[tmp])               \n\t"
-    "gslqc1     $f18, $f16, 640-80(%[tmp])                \n\t"
-    "or         $f28, $f28, $f16                          \n\t"
-    "or         $f30, $f30, $f18                          \n\t"
-    "and        $f16, $f0, $f28                           \n\t"
-    "and        $f18, $f2, $f30                           \n\t"
-    "gslqc1     $f30, $f28, 640-416(%[tmp])               \n\t"
-    "pandn      $f0, $f0, $f28                            \n\t"
-    "pandn      $f2, $f2, $f30                            \n\t"
-    "or         $f16, $f16, $f0                           \n\t"
-    "or         $f18, $f18, $f2                           \n\t"
-    "xor        $f28, $f28, $f28                          \n\t"
-    "xor        $f30, $f30, $f30                          \n\t"
-    "gslqc1     $f2, $f0, 0x0($12)                        \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "punpcklbh  $f0, $f2, $f30                            \n\t"
-    "punpckhbh  $f2, $f2, $f30                            \n\t"
-    "psllh      $f0, $f0, $f28                            \n\t"
-    "psllh      $f2, $f2, $f28                            \n\t"
-    "paddh      $f0, $f0, $f8                             \n\t"
-    "paddh      $f2, $f2, $f10                            \n\t"
-    "paddh      $f0, $f0, $f8                             \n\t"
-    "paddh      $f2, $f2, $f10                            \n\t"
-    "paddh      $f0, $f0, $f8                             \n\t"
-    "paddh      $f2, $f2, $f10                            \n\t"
-    "paddh      $f0, $f0, $f20                            \n\t"
-    "paddh      $f2, $f2, $f22                            \n\t"
-    "dli        %[iAlpha], 0x3                            \n\t"
-    "gslqc1     $f30, $f28, 640-432(%[tmp])               \n\t"
-    "paddh      $f0, $f0, $f28                            \n\t"
-    "paddh      $f2, $f2, $f30                            \n\t"
-    "gslqc1     $f30, $f28, 640-496(%[tmp])               \n\t"
-    "paddh      $f0, $f0, $f28                            \n\t"
-    "paddh      $f2, $f2, $f30                            \n\t"
-    "gslqc1     $f30, $f28, 640-592(%[tmp])               \n\t"
-    "paddh      $f0, $f0, $f28                            \n\t"
-    "paddh      $f2, $f2, $f30                            \n\t"
-    "dmtc1      %[iAlpha], $f28                           \n\t"
-    "psrah      $f0, $f0, $f28                            \n\t"
-    "psrah      $f2, $f2, $f28                            \n\t"
-    "and        $f0, $f0, $f12                            \n\t"
-    "and        $f2, $f2, $f14                            \n\t"
-    "pandn      $f12, $f12, $f8                           \n\t"
-    "pandn      $f14, $f14, $f10                          \n\t"
-    "or         $f0, $f0, $f12                            \n\t"
-    "or         $f2, $f2, $f14                            \n\t"
-    "and        $f28, $f4, $f0                            \n\t"
-    "and        $f30, $f6, $f2                            \n\t"
-
-    "gslqc1     $f2, $f0, 656-272(%[tmp])                 \n\t"
-    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
-
-    "gslqc1     $f2, $f0, 672-272(%[tmp])                 \n\t"
-
-    "gssqc1     $f2, $f0, 0x0($8)                         \n\t"
-    "gslqc1     $f2, $f0, 688-272(%[tmp])                 \n\t"
-    "gssqc1     $f2, $f0, 0x0($9)                         \n\t"
-    "gslqc1     $f2, $f0, 704-272(%[tmp])                 \n\t"
-
-    "pandn      $f4, $f4, $f8                             \n\t"
-    "pandn      $f6, $f6, $f10                            \n\t"
-    "gssqc1     $f2, $f0, 0x0(%[pPix])                    \n\t"
-    "or         $f28, $f28, $f4                           \n\t"
-    "or         $f30, $f30, $f6                           \n\t"
-    "packushb   $f16, $f16, $f18                          \n\t"
-    "packushb   $f18, $f28, $f30                          \n\t"
-    "gssqc1     $f26, $f24, 0x0($13)                      \n\t"
-    "gssqc1     $f18, $f16, 0x0(%[iStride])               \n\t"
-    : [pPix]"+&r"((unsigned char *)pPix)
-    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
-      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
-    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0",
-      "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
-      "$f22", "$f24", "$f26", "$f28", "$f30"
-  );
-  RECOVER_REG;
-}
-
-void DeblockChromaLt4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
-                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) { /*
-   * Filters one 8-pixel-wide row pair in each of the Cb and Cr planes using
-   * Loongson (arch=loongson3a) SIMD inline assembly.
-   *
-   * Notation used in the comments below, per plane, relative to pPix*:
-   *   p1 = 8 bytes at -2*iStride,  p0 = 8 bytes at -iStride,
-   *   q0 = 8 bytes at 0,           q1 = 8 bytes at +iStride.
-   * All four rows are read; only p0 and q0 are written back.
-   *
-   * For each pixel the code computes
-   *   d = clamp(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
-   * and stores p0 + d and q0 - d (saturated to unsigned bytes) in lanes where
-   *   iAlpha > |p0 - q0|, iBeta > |p1 - p0|, iBeta > |q1 - q0| and tc > 0;
-   * other lanes get d = 0.
-   *
-   * pPixCb, pPixCr  base pointers of the q0 row in the Cb / Cr plane
-   * iStride         byte offset between consecutive rows
-   * iAlpha, iBeta   thresholds, broadcast to 16-bit lanes
-   * pTC             four signed bytes pTC[0..3] giving the clipping bound tc
-   * tmp             256-byte aligned scratch area used to spill 128-bit values
-   *
-   * NOTE(review): %[pTC], %[iAlpha] and %[iBeta] are listed as input-only
-   * operands below, yet the asm overwrites them (they are reused as scratch
-   * registers). GCC's extended-asm rules forbid modifying input operands --
-   * confirm and consider declaring them as read-write operands.
-   */
-  unsigned char tmp[256] __attribute__((aligned(32)));
-  BACKUP_REG; /* macro defined outside this view; presumably saves FP registers clobbered below -- verify */
-  __asm__ volatile (
-    ".set       arch=loongson3a                           \n\t"
-    "lb         $8, 0x2(%[pTC])                           \n\t" /* $8 = pTC[2] (signed byte load) */
-    "lb         $9, 0x3(%[pTC])                           \n\t" /* $9 = pTC[3] */
-    "move       $11, $8                                   \n\t" /* $11 = pTC[2] */
-    "lb         $8, 0x1(%[pTC])                           \n\t" /* $8 = pTC[1] */
-    "lb         %[pTC], 0x0(%[pTC])                       \n\t" /* %[pTC] now holds pTC[0]; the pointer value is gone */
-    "move       $12, %[pTC]                               \n\t" /* $12 = pTC[0] */
-    "and        %[pTC], $9, 0xFFFF                        \n\t" /* from here: mask each tc to 16 bits and move it to an FPR */
-    "dmtc1      %[pTC], $f4                               \n\t"
-    "and        %[pTC], $9, 0xFFFF                        \n\t"
-    "dmtc1      %[pTC], $f8                               \n\t"
-    "move       %[pTC], $11                               \n\t"
-    "and        $9, %[pTC], 0xFFFF                        \n\t"
-    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
-    "dmtc1      %[pTC], $f16                              \n\t"
-    "and        %[pTC], $8, 0xFFFF                        \n\t"
-    "dmtc1      %[pTC], $f20                              \n\t"
-    "dmtc1      $9, $f12                                  \n\t"
-    "and        %[pTC], $8, 0xFFFF                        \n\t"
-    "dmtc1      %[pTC], $f24                              \n\t"
-    "move       %[pTC], $12                               \n\t"
-    "and        $9, %[pTC], 0xFFFF                        \n\t"
-    "and        %[pTC], %[pTC], 0xFFFF                    \n\t"
-    "punpcklhw  $f24, $f24, $f8                           \n\t" /* halfword interleaves build the per-lane tc vector */
-    "xor        $f0, $f0, $f0                             \n\t"
-    "xor        $f2, $f2, $f2                             \n\t"
-    "gssqc1     $f2, $f0, 0x40(%[tmp])                    \n\t" /* tmp+0x40 = zero vector */
-    "dmtc1      $9, $f28                                  \n\t"
-    "dmtc1      %[pTC], $f0                               \n\t"
-    "daddu      %[pTC], %[iStride], %[iStride]            \n\t" /* %[pTC] = 2*iStride */
-    "dsubu      $9, %[pPixCb], %[pTC]                     \n\t" /* $9 = address of p1 (Cb) */
-    "punpcklhw  $f20, $f20, $f4                           \n\t"
-    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t" /* $f4/$f6 = 0, zero operand for unpacks and compares */
-    "punpcklhw  $f0, $f0, $f16                            \n\t"
-    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCr])          \n\t" /* $f16 = q1 (Cr) */
-    "punpcklhw  $f28, $f28, $f12                          \n\t"
-    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t" /* $f12 = q0 (Cb) */
-    "punpcklhw  $f0, $f0, $f24                            \n\t"
-    "gsldxc1    $f24, 0x0($9, $0)                         \n\t" /* $f24 = p1 (Cb) */
-    "punpcklhw  $f28, $f28, $f20                          \n\t"
-    "punpckhhw  $f2, $f0, $f28                            \n\t" /* $f0/$f2 = tc in eight halfword lanes */
-    "punpcklhw  $f0, $f0, $f28                            \n\t" /* (presumably each pTC[i] covers two adjacent pixels -- verify) */
-    "dsubu      $9, %[pPixCr], %[pTC]                     \n\t" /* $9 = address of p1 (Cr) */
-    "psubh      $f8, $f4, $f0                             \n\t" /* $f8/$f10 = 0 - tc */
-    "psubh      $f10, $f6, $f2                            \n\t"
-    "gssqc1     $f10, $f8, 0x60(%[tmp])                   \n\t" /* tmp+0x60 = -tc */
-    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
-    "mov.d      $f26, $f8                                 \n\t" /* $f26 = p1 (Cr) */
-    "dsubu      %[pTC], %[pPixCb], %[iStride]             \n\t" /* %[pTC] = address of p0 (Cb); reused for the final store */
-    "gsldxc1    $f28, 0x0(%[pTC], $0)                     \n\t" /* $f28 = p0 (Cb) */
-    "dsubu      $9, %[pPixCr], %[iStride]                 \n\t" /* $9 = address of p0 (Cr); reused for the final store */
-    "gsldxc1    $f8, 0x0($9, $0)                          \n\t"
-    "mov.d      $f30, $f8                                 \n\t" /* $f30 = p0 (Cr) */
-    "gsldxc1    $f8, 0x0(%[pPixCr], $0)                   \n\t"
-    "mov.d      $f14, $f8                                 \n\t" /* $f14 = q0 (Cr) */
-    "gsldxc1    $f8, 0x0(%[iStride], %[pPixCb])           \n\t" /* $f8 = q1 (Cb) */
-    "mov.d      $f10, $f16                                \n\t" /* $f10 = q1 (Cr) */
-    "gssqc1     $f10, $f8, 0xE0(%[tmp])                   \n\t" /* tmp+0xE0 = q1 packed bytes (Cb from $f8, Cr from $f10) */
-    "dmtc1      %[iAlpha], $f8                            \n\t"
-    "punpcklhw  $f16, $f8, $f8                            \n\t" /* start broadcasting iAlpha to halfword lanes */
-    "dmtc1      %[iBeta], $f8                             \n\t"
-    "punpcklhw  $f20, $f8, $f8                            \n\t"
-    "punpcklwd  $f8, $f20, $f20                           \n\t"
-    "mov.d      $f10, $f8                                 \n\t"
-    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t" /* tmp+0x50 = iBeta in every halfword lane */
-    "punpckhbh  $f10, $f24, $f4                           \n\t" /* widen bytes to halfwords by interleaving with zero */
-    "punpcklbh  $f8, $f24, $f4                            \n\t"
-    "gssqc1     $f14, $f12, 0xd0(%[tmp])                  \n\t" /* tmp+0xd0 = q0 packed bytes (Cb from $f12, Cr from $f14) */
-    "punpcklwd  $f16, $f16, $f16                          \n\t"
-    "mov.d      $f18, $f16                                \n\t" /* $f16/$f18 = iAlpha in every halfword lane */
-    "gssqc1     $f10, $f8, 0x30(%[tmp])                   \n\t" /* tmp+0x30 = p1 (Cb) as halfwords */
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-    "gssqc1     $f26, $f24, 0x80(%[tmp])                  \n\t" /* tmp+0x80 = p1 (Cr) as halfwords */
-    "gslqc1     $f26, $f24, 0xd0(%[tmp])                  \n\t"
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t" /* tmp+0x70 = q0 (Cr) as halfwords */
-    "gslqc1     $f26, $f24, 0xe0(%[tmp])                  \n\t"
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t" /* tmp+0x90 = q1 (Cr) as halfwords */
-    "gslqc1     $f22, $f20, 0xe0(%[tmp])                  \n\t"
-    "mov.d      $f8, $f28                                 \n\t"
-    "mov.d      $f10, $f30                                \n\t"
-    "punpcklbh  $f28, $f30, $f6                           \n\t"
-    "punpckhbh  $f30, $f30, $f6                           \n\t"
-    "punpckhbh  $f22, $f20, $f4                           \n\t"
-    "punpcklbh  $f20, $f20, $f4                           \n\t" /* $f20/$f22 = q1 (Cb) as halfwords */
-    "gssqc1     $f30, $f28, 0xa0(%[tmp])                  \n\t" /* tmp+0xa0 = p0 (Cr) as halfwords */
-    "punpckhbh  $f14, $f12, $f4                           \n\t"
-    "punpcklbh  $f12, $f12, $f4                           \n\t" /* $f12/$f14 = q0 (Cb) as halfwords */
-    "dli        %[iBeta], 0x4                             \n\t" /* %[iBeta] reused as scratch: rounding constant 4 */
-    "punpckhbh  $f10, $f8, $f4                            \n\t"
-    "punpcklbh  $f8, $f8, $f4                             \n\t" /* $f8/$f10 = p0 (Cb) as halfwords */
-    "dmtc1      %[iBeta], $f24                            \n\t"
-    "punpcklhw  $f28, $f24, $f24                          \n\t"
-    "punpcklwd  $f24, $f28, $f28                          \n\t"
-    "mov.d      $f26, $f24                                \n\t"
-    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
-    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t" /* tmp+0x20 = 4 in every halfword lane */
-    "psubh      $f28, $f28, $f20                          \n\t" /* Cb: p1 - q1 */
-    "psubh      $f30, $f30, $f22                          \n\t"
-    "pcmpgth    $f24, $f0, $f4                            \n\t" /* lanes where tc > 0 */
-    "pcmpgth    $f26, $f2, $f6                            \n\t"
-    "gslqc1     $f6, $f4, 0x60(%[tmp])                    \n\t" /* $f4/$f6 = -tc */
-    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t" /* tmp+0x40 = (tc > 0) mask */
-    "psubh      $f24, $f12, $f8                           \n\t" /* Cb: q0 - p0 */
-    "psubh      $f26, $f14, $f10                          \n\t"
-    "dmfc1      %[iAlpha], $f12                           \n\t" /* spill $f12/$f14 to GPRs; the FPRs are needed as shift counts */
-    "dmfc1      %[iBeta], $f14                            \n\t"
-    "dli        $10, 0x2                                  \n\t"
-    "dmtc1      $10, $f12                                 \n\t"
-    "dli        $10, 0x3                                  \n\t"
-    "dmtc1      $10, $f14                                 \n\t"
-    "psllh      $f24, $f24, $f12                          \n\t" /* (q0 - p0) << 2 */
-    "psllh      $f26, $f26, $f12                          \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t" /* + (p1 - q1) */
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x20(%[tmp])                  \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t" /* + 4 */
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t" /* $f28/$f30 = beta lanes */
-    "psrah      $f24, $f24, $f14                          \n\t" /* arithmetic >> 3 */
-    "psrah      $f26, $f26, $f14                          \n\t"
-    "dmtc1      %[iAlpha], $f12                           \n\t" /* restore q0 (Cb) halfwords */
-    "dmtc1      %[iBeta], $f14                            \n\t"
-    "pmaxsh     $f4, $f4, $f24                            \n\t" /* clamp from below by -tc */
-    "pmaxsh     $f6, $f6, $f26                            \n\t"
-    "gssqc1     $f2, $f0, 0x10(%[tmp])                    \n\t"
-    "gslqc1     $f26, $f24, 0x10(%[tmp])                  \n\t" /* copy tc into $f24/$f26 via tmp+0x10 */
-    "pminsh     $f24, $f24, $f4                           \n\t" /* clamp from above by tc */
-    "pminsh     $f26, $f26, $f6                           \n\t"
-    "gssqc1     $f26, $f24, 0x10(%[tmp])                  \n\t" /* tmp+0x10 = clipped Cb correction d */
-    "psubh      $f4, $f8, $f12                            \n\t" /* Cb: p0 - q0 */
-    "psubh      $f6, $f10, $f14                           \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f24, $f26) /* macro defined outside this view; presumably per-halfword abs with the last two regs as scratch -- verify */
-    "pcmpgth    $f24, $f16, $f4                           \n\t" /* alpha > |p0 - q0| */
-    "pcmpgth    $f26, $f18, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 0x30(%[tmp])                    \n\t"
-    "psubh      $f4, $f4, $f8                             \n\t" /* Cb: p1 - p0 */
-    "psubh      $f6, $f6, $f10                            \n\t"
-    "dmfc1      %[iAlpha], $f8                            \n\t" /* spill p0 (Cb) halfwords to GPRs; restored just before the final add */
-    "dmfc1      %[iBeta], $f10                            \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f8, $f10)
-    "pcmpgth    $f28, $f28, $f4                           \n\t" /* beta > |p1 - p0| */
-    "pcmpgth    $f30, $f30, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
-    "and        $f24, $f24, $f28                          \n\t"
-    "and        $f26, $f26, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
-    "psubh      $f20, $f20, $f12                          \n\t" /* Cb: q1 - q0 */
-    "psubh      $f22, $f22, $f14                          \n\t"
-    WELS_AbsH($f20, $f22, $f20, $f22, $f8, $f10)
-    "pcmpgth    $f4, $f4, $f20                            \n\t" /* beta > |q1 - q0| */
-    "pcmpgth    $f6, $f6, $f22                            \n\t"
-    "gslqc1     $f22, $f20, 0x80(%[tmp])                  \n\t"
-    "gslqc1     $f10, $f8, 0x90(%[tmp])                   \n\t"
-    "psubh      $f20, $f20, $f8                           \n\t" /* Cr: p1 - q1 */
-    "psubh      $f22, $f22, $f10                          \n\t"
-    "and        $f24, $f24, $f4                           \n\t"
-    "and        $f26, $f26, $f6                           \n\t"
-    "gslqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
-    "and        $f24, $f24, $f8                           \n\t" /* combine with the (tc > 0) mask */
-    "and        $f26, $f26, $f10                          \n\t"
-    "gslqc1     $f6, $f4, 0x10(%[tmp])                    \n\t"
-    "and        $f4, $f4, $f24                            \n\t" /* zero the Cb correction in lanes that fail the tests */
-    "and        $f6, $f6, $f26                            \n\t"
-    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
-    "gssqc1     $f6, $f4, 0x30(%[tmp])                    \n\t" /* tmp+0x30 = masked Cb correction (slot reused) */
-    "gslqc1     $f6, $f4, 0xa0(%[tmp])                    \n\t" /* $f4/$f6 = p0 (Cr) halfwords */
-    "psubh      $f24, $f24, $f4                           \n\t" /* Cr: q0 - p0 */
-    "psubh      $f26, $f26, $f6                           \n\t"
-    "dli        $10, 0x2                                  \n\t"
-    "dmtc1      $10, $f8                                  \n\t"
-    "psllh      $f24, $f24, $f8                           \n\t" /* << 2 */
-    "psllh      $f26, $f26, $f8                           \n\t"
-    "paddh      $f24, $f24, $f20                          \n\t" /* + (p1 - q1) */
-    "paddh      $f26, $f26, $f22                          \n\t"
-    "dli        $10, 0x3                                  \n\t"
-    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
-    "paddh      $f24, $f24, $f8                           \n\t" /* + 4 */
-    "paddh      $f26, $f26, $f10                          \n\t"
-    "dmtc1      $10, $f8                                  \n\t"
-    "gslqc1     $f22, $f20, 0x60(%[tmp])                  \n\t" /* $f20/$f22 = -tc */
-    "psrah      $f24, $f24, $f8                           \n\t" /* arithmetic >> 3 */
-    "psrah      $f26, $f26, $f8                           \n\t"
-    "pmaxsh     $f20, $f20, $f24                          \n\t" /* clamp from below by -tc */
-    "pmaxsh     $f22, $f22, $f26                          \n\t"
-    "pminsh     $f0, $f0, $f20                            \n\t" /* clamp from above by tc: $f0/$f2 = clipped Cr correction */
-    "pminsh     $f2, $f2, $f22                            \n\t"
-    "gslqc1     $f22, $f20, 0x70(%[tmp])                  \n\t" /* $f20/$f22 = q0 (Cr) halfwords */
-    "psubh      $f24, $f4, $f20                           \n\t" /* Cr: p0 - q0 */
-    "psubh      $f26, $f6, $f22                           \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
-    "pcmpgth    $f16, $f16, $f24                          \n\t" /* alpha > |p0 - q0| */
-    "pcmpgth    $f18, $f18, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f4                           \n\t" /* Cr: p1 - p0 */
-    "psubh      $f26, $f26, $f6                           \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
-    "pcmpgth    $f28, $f28, $f24                          \n\t" /* beta > |p1 - p0| */
-    "pcmpgth    $f30, $f30, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
-    "and        $f16, $f16, $f28                          \n\t"
-    "and        $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f20                          \n\t" /* Cr: q1 - q0 */
-    "psubh      $f26, $f26, $f22                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f8, $f10)
-    "dmtc1      %[iAlpha], $f8                            \n\t" /* restore p0 (Cb) halfwords spilled earlier */
-    "dmtc1      %[iBeta], $f10                            \n\t"
-    "pcmpgth    $f28, $f28, $f24                          \n\t" /* beta > |q1 - q0| */
-    "pcmpgth    $f30, $f30, $f26                          \n\t"
-    "and        $f16, $f16, $f28                          \n\t"
-    "and        $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
-    "and        $f16, $f16, $f24                          \n\t" /* combine with the (tc > 0) mask */
-    "and        $f18, $f18, $f26                          \n\t"
-    "and        $f0, $f0, $f16                            \n\t" /* zero the Cr correction in lanes that fail the tests */
-    "and        $f2, $f2, $f18                            \n\t"
-    "gslqc1     $f18, $f16, 0x30(%[tmp])                  \n\t" /* $f16/$f18 = masked Cb correction */
-    "paddh      $f8, $f8, $f16                            \n\t" /* Cb: p0 + d */
-    "paddh      $f10, $f10, $f18                          \n\t"
-    "paddh      $f4, $f4, $f0                             \n\t" /* Cr: p0 + d */
-    "paddh      $f6, $f6, $f2                             \n\t"
-    "packushb   $f8, $f8, $f10                            \n\t" /* pack halfwords back to bytes with unsigned saturation */
-    "packushb   $f10, $f4, $f6                            \n\t"
-    "gssdxc1    $f8, 0x0(%[pTC], $0)                      \n\t" /* store new p0 (Cb) at pPixCb - iStride */
-    "psubh      $f12, $f12, $f16                          \n\t" /* Cb: q0 - d */
-    "psubh      $f14, $f14, $f18                          \n\t"
-    "psubh      $f20, $f20, $f0                           \n\t" /* Cr: q0 - d */
-    "psubh      $f22, $f22, $f2                           \n\t"
-    "packushb   $f12, $f12, $f14                          \n\t"
-    "packushb   $f14, $f20, $f22                          \n\t"
-    "gssdxc1    $f12, 0x0(%[pPixCb], $0)                  \n\t" /* store new q0 (Cb) */
-    "gssdxc1    $f10, 0x0($9, $0)                         \n\t" /* store new p0 (Cr) at pPixCr - iStride */
-    "gssdxc1    $f14, 0x0(%[pPixCr], $0)                  \n\t" /* store new q0 (Cr) */
-    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
-    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha), [iBeta]"r"(iBeta), /* NOTE(review): iAlpha/iBeta are written inside the asm */
-      [pTC]"r"((unsigned char *)pTC), [tmp]"r"((unsigned char *)tmp) /* NOTE(review): pTC is written inside the asm */
-    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
-      "$f10", "$f12",  "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
-      "$f28", "$f30"
-  );
-  RECOVER_REG; /* macro defined outside this view; presumably the counterpart of BACKUP_REG -- verify */
-}
-
-void DeblockChromaEq4V_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
-                           int32_t iAlpha, int32_t iBeta) {
-  unsigned char tmp[128] __attribute__((aligned(32)));
-  BACKUP_REG;
-  __asm__ volatile (
-    ".set       arch=loongson3a                          \n\t"
-    "daddu      $8, %[iStride], %[iStride]               \n\t"
-    "dsubu      $9, %[pPixCb], $8                        \n\t"
-    "gsldxc1    $f16, 0x0(%[pPixCr], $0)                 \n\t"
-    "gsldxc1    $f20, 0x0(%[iStride], %[pPixCr])         \n\t"
-    "gsldxc1    $f4, 0x0($9, $0)                         \n\t"
-    "dsubu      $9, %[pPixCr], $8                        \n\t"
-    "gsldxc1    $f8, 0x0($9, $0)                         \n\t"
-    "mov.d      $f6, $f8                                 \n\t"
-    "dsubu      $8, %[pPixCb], %[iStride]                \n\t"
-    "gsldxc1    $f8, 0x0($8, $0)                         \n\t"
-    "dsubu      $9, %[pPixCr], %[iStride]                \n\t"
-    "gsldxc1    $f12, 0x0($9, $0)                        \n\t"
-    "mov.d      $f10, $f12                               \n\t"
-    "gsldxc1    $f12, 0x0(%[pPixCb], $0)                 \n\t"
-    "mov.d      $f14, $f16                               \n\t"
-    "gsldxc1    $f16, 0x0(%[iStride], %[pPixCb])         \n\t"
-    "mov.d      $f18, $f20                               \n\t"
-    "dmtc1      %[iAlpha], $f20                          \n\t"
-    "xor        $f0, $f0, $f0                            \n\t"
-    "xor        $f2, $f2, $f2                            \n\t"
-    "punpcklhw  $f24, $f20, $f20                         \n\t"
-    "punpcklwd  $f20, $f24, $f24                         \n\t"
-    "mov.d      $f22, $f20                               \n\t"
-    "dmtc1      %[iBeta], $f24                           \n\t"
-    "punpcklhw  $f28, $f24, $f24                         \n\t"
-    "punpcklwd  $f24, $f28, $f28                         \n\t"
-    "mov.d      $f26, $f24                               \n\t"
-    "mov.d      $f28, $f4                                \n\t"
-    "punpcklbh  $f4, $f6, $f2                            \n\t"
-    "punpckhbh  $f6, $f6, $f2                            \n\t"
-    "punpckhbh  $f30, $f28, $f0                          \n\t"
-    "punpcklbh  $f28, $f28, $f0                          \n\t"
-    "gssqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
-    "gssqc1     $f30, $f28, 0x60(%[tmp])                 \n\t"
-    "punpckhbh  $f30, $f8, $f0                           \n\t"
-    "punpcklbh  $f28, $f8, $f0                           \n\t"
-    "gssqc1     $f30, $f28, 0x10(%[tmp])                 \n\t"
-    "punpckhbh  $f30, $f12, $f0                          \n\t"
-    "punpcklbh  $f28, $f12, $f0                          \n\t"
-    "punpcklbh  $f12, $f14, $f2                          \n\t"
-    "punpckhbh  $f14, $f14, $f2                          \n\t"
-    "gssqc1     $f30, $f28, 0x50(%[tmp])                 \n\t"
-    "mov.d      $f28, $f16                               \n\t"
-    "punpcklbh  $f16, $f18, $f2                          \n\t"
-    "punpckhbh  $f18, $f18, $f2                          \n\t"
-    "punpcklbh  $f8, $f10, $f2                           \n\t"
-    "punpckhbh  $f10, $f10, $f2                          \n\t"
-    "punpckhbh  $f30, $f28, $f0                          \n\t"
-    "punpcklbh  $f28, $f28, $f0                          \n\t"
-    "gssqc1     $f14, $f12, 0x30(%[tmp])                 \n\t"
-    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
-    "gslqc1     $f2, $f0, 0x50(%[tmp])                   \n\t"
-    "psubh      $f4, $f12, $f0                           \n\t"
-    "psubh      $f6, $f14, $f2                           \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
-    "gssqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
-    "pcmpgth    $f0, $f20, $f4                           \n\t"
-    "pcmpgth    $f2, $f22, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 0x60(%[tmp])                   \n\t"
-    "psubh      $f4, $f4, $f12                           \n\t"
-    "psubh      $f6, $f6, $f14                           \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
-    "pcmpgth    $f16, $f24, $f4                          \n\t"
-    "pcmpgth    $f18, $f26, $f6                          \n\t"
-    "and        $f0, $f0, $f16                           \n\t"
-    "and        $f2, $f2, $f18                           \n\t"
-    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
-    "psubh      $f4, $f28, $f16                          \n\t"
-    "psubh      $f6, $f30, $f18                          \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
-    "pcmpgth    $f16, $f24, $f4                          \n\t"
-    "pcmpgth    $f18, $f26, $f6                          \n\t"
-    "gslqc1     $f6, $f4, 0x30(%[tmp])                   \n\t"
-    "psubh      $f4, $f8, $f4                            \n\t"
-    "psubh      $f6, $f10, $f6                           \n\t"
-    "dmfc1      %[iAlpha], $f28                          \n\t"
-    "dmfc1      %[iBeta], $f30                           \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
-    "pcmpgth    $f20, $f20, $f4                          \n\t"
-    "pcmpgth    $f22, $f22, $f6                          \n\t"
-    "gslqc1     $f6, $f4, 0x40(%[tmp])                   \n\t"
-    "and        $f0, $f0, $f16                           \n\t"
-    "and        $f2, $f2, $f18                           \n\t"
-    "psubh      $f4, $f4, $f8                            \n\t"
-    "psubh      $f6, $f6, $f10                           \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f16, $f18)
-    "pcmpgth    $f16, $f24, $f4                          \n\t"
-    "pcmpgth    $f18, $f26, $f6                          \n\t"
-    "gslqc1     $f6, $f4, 0x20(%[tmp])                   \n\t"
-    "gslqc1     $f30, $f28, 0x30(%[tmp])                 \n\t"
-    "psubh      $f4, $f4, $f28                           \n\t"
-    "psubh      $f6, $f6, $f30                           \n\t"
-    "and        $f20, $f20, $f16                         \n\t"
-    "and        $f22, $f22, $f18                         \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f28, $f30)
-    "dmtc1      %[iAlpha], $f28                          \n\t"
-    "dmtc1      %[iBeta], $f30                           \n\t"
-    "pcmpgth    $f24, $f24, $f4                          \n\t"
-    "pcmpgth    $f26, $f26, $f6                          \n\t"
-    "and        $f20, $f20, $f24                         \n\t"
-    "and        $f22, $f22, $f26                         \n\t"
-    "dli        %[iBeta], 0x2                            \n\t"
-    "dmtc1      %[iBeta], $f4                            \n\t"
-    "punpcklhw  $f16, $f4, $f4                           \n\t"
-    "punpcklwd  $f4, $f16, $f16                          \n\t"
-    "mov.d      $f6, $f4                                 \n\t"
-    "gslqc1     $f18, $f16, 0x60(%[tmp])                 \n\t"
-    "paddh      $f24, $f16, $f16                         \n\t"
-    "paddh      $f26, $f18, $f18                         \n\t"
-    "paddh      $f24, $f24, $f12                         \n\t"
-    "paddh      $f26, $f26, $f14                         \n\t"
-    "paddh      $f24, $f24, $f28                         \n\t"
-    "paddh      $f26, $f26, $f30                         \n\t"
-    "gssqc1     $f6, $f4, 0x10(%[tmp])                   \n\t"
-    "gslqc1     $f18, $f16, 0x10(%[tmp])                 \n\t"
-    "paddh      $f24, $f24, $f16                         \n\t"
-    "paddh      $f26, $f26, $f18                         \n\t"
-    "dmtc1      %[iBeta], $f16                           \n\t"
-    "psrah      $f24, $f24, $f16                         \n\t"
-    "psrah      $f26, $f26, $f16                         \n\t"
-    "pandn      $f16, $f0, $f12                          \n\t"
-    "pandn      $f18, $f2, $f14                          \n\t"
-    "gslqc1     $f14, $f12, 0x40(%[tmp])                 \n\t"
-    "and        $f4, $f0, $f24                           \n\t"
-    "and        $f6, $f2, $f26                           \n\t"
-    "or         $f4, $f4, $f16                           \n\t"
-    "or         $f6, $f6, $f18                           \n\t"
-    "paddh      $f24, $f12, $f12                         \n\t"
-    "paddh      $f26, $f14, $f14                         \n\t"
-    "gslqc1     $f14, $f12, 0x10(%[tmp])                 \n\t"
-    "paddh      $f24, $f24, $f8                          \n\t"
-    "paddh      $f26, $f26, $f10                         \n\t"
-    "gslqc1     $f18, $f16, 0x20(%[tmp])                 \n\t"
-    "paddh      $f24, $f24, $f16                         \n\t"
-    "paddh      $f26, $f26, $f18                         \n\t"
-    "dmtc1      %[iBeta], $f16                           \n\t"
-    "paddh      $f24, $f24, $f12                         \n\t"
-    "paddh      $f26, $f26, $f14                         \n\t"
-    "psrah      $f24, $f24, $f16                         \n\t"
-    "psrah      $f26, $f26, $f16                         \n\t"
-    "and        $f16, $f20, $f24                         \n\t"
-    "and        $f18, $f22, $f26                         \n\t"
-    "pandn      $f24, $f20, $f8                          \n\t"
-    "pandn      $f26, $f22, $f10                         \n\t"
-    "or         $f16, $f16, $f24                         \n\t"
-    "or         $f18, $f18, $f26                         \n\t"
-    "packushb   $f4, $f4, $f6                            \n\t"
-    "packushb   $f6, $f16, $f18                          \n\t"
-    "gslqc1     $f18, $f16, 0x50(%[tmp])                 \n\t"
-    "paddh      $f24, $f28, $f28                         \n\t"
-    "paddh      $f26, $f30, $f30                         \n\t"
-    "paddh      $f24, $f24, $f16                         \n\t"
-    "paddh      $f26, $f26, $f18                         \n\t"
-    "gslqc1     $f10, $f8, 0x60(%[tmp])                  \n\t"
-    "paddh      $f24, $f24, $f8                          \n\t"
-    "paddh      $f26, $f26, $f10                         \n\t"
-    "dmtc1      %[iBeta], $f28                           \n\t"
-    "paddh      $f24, $f24, $f12                         \n\t"
-    "paddh      $f26, $f26, $f14                         \n\t"
-    "psrah      $f24, $f24, $f28                         \n\t"
-    "psrah      $f26, $f26, $f28                         \n\t"
-    "and        $f8, $f0, $f24                           \n\t"
-    "and        $f10, $f2, $f26                          \n\t"
-    "pandn      $f0, $f0, $f16                           \n\t"
-    "pandn      $f2, $f2, $f18                           \n\t"
-    "or         $f8, $f8, $f0                            \n\t"
-    "or         $f10, $f10, $f2                          \n\t"
-    "gslqc1     $f2, $f0, 0x20(%[tmp])                   \n\t"
-    "paddh      $f24, $f0, $f0                           \n\t"
-    "paddh      $f26, $f2, $f2                           \n\t"
-    "gslqc1     $f2, $f0, 0x30(%[tmp])                   \n\t"
-    "paddh      $f24, $f24, $f0                          \n\t"
-    "paddh      $f26, $f26, $f2                          \n\t"
-    "gslqc1     $f18, $f16, 0x40(%[tmp])                 \n\t"
-    "paddh      $f24, $f24, $f16                         \n\t"
-    "paddh      $f26, $f26, $f18                         \n\t"
-    "paddh      $f24, $f24, $f12                         \n\t"
-    "paddh      $f26, $f26, $f14                         \n\t"
-    "gssdxc1    $f4, 0x0($8, $0)                         \n\t"
-    "psrah      $f24, $f24, $f28                         \n\t"
-    "psrah      $f26, $f26, $f28                         \n\t"
-    "and        $f16, $f20, $f24                         \n\t"
-    "and        $f18, $f22, $f26                         \n\t"
-    "pandn      $f20, $f20, $f0                          \n\t"
-    "pandn      $f22, $f22, $f2                          \n\t"
-    "or         $f16, $f16, $f20                         \n\t"
-    "or         $f18, $f18, $f22                         \n\t"
-    "packushb   $f8, $f8, $f10                           \n\t"
-    "packushb   $f10, $f16, $f18                         \n\t"
-    "gssdxc1    $f8, 0x0(%[pPixCb], $0)                  \n\t"
-    "gssdxc1    $f6, 0x0($9, $0)                         \n\t"
-    "gssdxc1    $f10, 0x0(%[pPixCr], $0)                 \n\t"
-    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
-    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
-      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp)
-    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8",
-      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
-      "$f28", "$f30"
-  );
-  RECOVER_REG;
-}
-
-/*
- * DeblockChromaEq4H_mmi
- *
- * Loongson MMI chroma deblocking routine that processes the Cb and Cr planes
- * together. For each of 8 consecutive rows it reads the bytes starting two
- * columns to the left of pPixCb / pPixCr, transposes them through the local
- * scratch buffer so that each pixel column becomes a vector, applies the
- * masked filter
- *     x' = (2 * a + x + b + 2) >> 2
- * where the mask requires the absolute differences between neighbouring
- * columns to be below iAlpha / iBeta, transposes back and writes 4 bytes per
- * row to both planes.
- * NOTE(review): the four columns appear to be p1, p0, q0, q1 of a vertical
- * edge located at pPixCb / pPixCr -- confirm against the C reference.
- *
- * pPixCb, pPixCr  pointers into the Cb / Cr planes; rows [0, 8) and columns
- *                 [-2, 2) relative to them are rewritten. The row loads are
- *                 8 bytes wide, so columns up to +5 are read as well.
- * iStride         distance in bytes between two consecutive rows.
- * iAlpha, iBeta   thresholds, broadcast to 16-bit lanes and compared with
- *                 pcmpgth against the absolute pixel differences.
- *
- * The asm reuses the registers holding iAlpha and iBeta as scratch (dmfc1 /
- * dli write to them), so both are declared as early-clobber read-write
- * operands: GCC does not permit modifying input-only operands.
- */
-void DeblockChromaEq4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
-                           int32_t iAlpha, int32_t iBeta) {
-  unsigned char tmp[256] __attribute__((aligned(32)));
-  BACKUP_REG;
-  __asm__ volatile (
-    ".set       arch=loongson3a                           \n\t"
-    /* Start each row two bytes to the left of the given position. */
-    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
-    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
-    /* $9 / $10 keep rows 0..3; pPixCb / pPixCr advance to rows 4..7. */
-    "move       $9, %[pPixCb]                             \n\t"
-    "move       $10, %[pPixCr]                            \n\t"
-    "dsll       $11, %[iStride], 0x2                      \n\t"
-    "daddu      %[pPixCb], %[pPixCb], $11                 \n\t"
-    "daddu      %[pPixCr], %[pPixCr], $11                 \n\t"
-    /* $11 = scratch area at tmp + 0x80 used for the transposed data. */
-    "daddiu     $11, %[tmp], 0x80                         \n\t"
-    /* Unaligned 8-byte loads of rows 0..3 (Cb, then Cr). */
-    "gsldlc1    $f0, 0x7($9)                              \n\t"
-    "gsldrc1    $f0, 0x0($9)                              \n\t"
-    "daddu      $12, $9, %[iStride]                       \n\t"
-    "gsldlc1    $f4, 0x7($12)                             \n\t"
-    "gsldrc1    $f4, 0x0($12)                             \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsldlc1    $f8, 0x7($12)                             \n\t"
-    "gsldrc1    $f8, 0x0($12)                             \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsldlc1    $f12, 0x7($12)                            \n\t"
-    "gsldlc1    $f16, 0x7($10)                            \n\t"
-    "gsldrc1    $f12, 0x0($12)                            \n\t"
-    "gsldrc1    $f16, 0x0($10)                            \n\t"
-    "daddu      $12, $10, %[iStride]                      \n\t"
-    "gsldlc1    $f20, 0x7($12)                            \n\t"
-    "gsldrc1    $f20, 0x0($12)                            \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsldlc1    $f24, 0x7($12)                            \n\t"
-    "gsldrc1    $f24, 0x0($12)                            \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsldlc1    $f28, 0x7($12)                            \n\t"
-    "gsldrc1    $f28, 0x0($12)                            \n\t"
-    /* Merge the low 4 bytes of the Cb row with the low 4 bytes of the Cr row. */
-    "punpcklwd  $f0, $f0, $f16                            \n\t"
-    "punpcklwd  $f4, $f4, $f20                            \n\t"
-    "punpcklwd  $f8, $f8, $f24                            \n\t"
-    "punpcklwd  $f12, $f12, $f28                          \n\t"
-    /* Same for rows 4..7, collected in $f2, $f6, $f10, $f14. */
-    "gsldlc1    $f16, 0x7(%[pPixCb])                      \n\t"
-    "gsldlc1    $f20, 0x7(%[pPixCr])                      \n\t"
-    "gsldrc1    $f16, 0x0(%[pPixCb])                      \n\t"
-    "gsldrc1    $f20, 0x0(%[pPixCr])                      \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f2, $f16                                 \n\t"
-    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
-    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
-    "gsldlc1    $f16, 0x7($12)                            \n\t"
-    "gsldlc1    $f20, 0x7($13)                            \n\t"
-    "gsldrc1    $f16, 0x0($12)                            \n\t"
-    "gsldrc1    $f20, 0x0($13)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f6, $f16                                 \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "daddu      $13, $13, %[iStride]                      \n\t"
-    "gsldlc1    $f16, 0x7($12)                            \n\t"
-    "gsldlc1    $f20, 0x7($13)                            \n\t"
-    "gsldrc1    $f16, 0x0($12)                            \n\t"
-    "gsldrc1    $f20, 0x0($13)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f10, $f16                                \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "daddu      $13, $13, %[iStride]                      \n\t"
-    "gsldlc1    $f16, 0x7($12)                            \n\t"
-    "gsldlc1    $f20, 0x7($13)                            \n\t"
-    "gsldrc1    $f16, 0x0($12)                            \n\t"
-    "gsldrc1    $f20, 0x0($13)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f14, $f16                                \n\t"
-    /* Byte transpose: interleave bytes, then halfwords, then words. */
-    "punpcklbh  $f24, $f2, $f6                            \n\t"
-    "punpckhbh  $f26, $f2, $f6                            \n\t"
-    "punpckhbh  $f2, $f0, $f4                             \n\t"
-    "punpcklbh  $f0, $f0, $f4                             \n\t"
-    "punpcklbh  $f28, $f10, $f14                          \n\t"
-    "punpckhbh  $f30, $f10, $f14                          \n\t"
-    "punpckhbh  $f10, $f8, $f12                           \n\t"
-    "punpcklbh  $f8, $f8, $f12                            \n\t"
-    "punpcklhw  $f16, $f2, $f10                           \n\t"
-    "punpckhhw  $f18, $f2, $f10                           \n\t"
-    "punpckhhw  $f2, $f0, $f8                             \n\t"
-    "punpcklhw  $f0, $f0, $f8                             \n\t"
-    "punpcklhw  $f20, $f26, $f30                          \n\t"
-    "punpckhhw  $f22, $f26, $f30                          \n\t"
-    "punpckhhw  $f26, $f24, $f28                          \n\t"
-    "punpcklhw  $f24, $f24, $f28                          \n\t"
-    "punpcklwd  $f4, $f2, $f26                            \n\t"
-    "punpckhwd  $f6, $f2, $f26                            \n\t"
-    "punpckhwd  $f2, $f0, $f24                            \n\t"
-    "punpcklwd  $f0, $f0, $f24                            \n\t"
-    "punpcklwd  $f8, $f18, $f22                           \n\t"
-    "punpckhwd  $f10, $f18, $f22                          \n\t"
-    "punpckhwd  $f18, $f16, $f20                          \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f20, $f2                                 \n\t"
-    "mov.d      $f22, $f18                                \n\t"
-    "mov.d      $f2, $f16                                 \n\t"
-    "mov.d      $f24, $f6                                 \n\t"
-    "mov.d      $f26, $f10                                \n\t"
-    "mov.d      $f6, $f8                                  \n\t"
-    /* Spill the four transposed 16-byte vectors to tmp+0x80..0xbf and reload
-     * them into the registers the filter code expects. */
-    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
-    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
-    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
-    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
-    "gslqc1     $f26, $f24, 0x80(%[tmp])                  \n\t"
-    "gslqc1     $f18, $f16, 0x90(%[tmp])                  \n\t"
-    "gslqc1     $f22, $f20, 0xa0(%[tmp])                  \n\t"
-    "gslqc1     $f30, $f28, 0xb0(%[tmp])                  \n\t"
-    /* $f0 = 0; broadcast iAlpha into $f4/$f6 and iBeta into $f8/$f10. */
-    "xor        $f0, $f0, $f0                             \n\t"
-    "dmtc1      %[iAlpha], $f4                            \n\t"
-    "punpcklhw  $f8, $f4, $f4                             \n\t"
-    "punpcklwd  $f4, $f8, $f8                             \n\t"
-    "mov.d      $f6, $f4                                  \n\t"
-    "dmtc1      %[iBeta], $f8                             \n\t"
-    "punpcklhw  $f12, $f8, $f8                            \n\t"
-    "punpcklwd  $f8, $f12, $f12                           \n\t"
-    "mov.d      $f10, $f8                                 \n\t"
-    /* Zero-extend bytes to halfwords; the upper halves of the four vectors
-     * are parked in tmp+0x30..0x7f, the lower halves stay in registers. */
-    "mov.d      $f12, $f24                                \n\t"
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
-    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
-    "gslqc1     $f26, $f24, 0xa0(%[tmp])                  \n\t"
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "gssqc1     $f26, $f24, 0x40(%[tmp])                  \n\t"
-    "gslqc1     $f26, $f24, 0xb0(%[tmp])                  \n\t"
-    "punpcklbh  $f24, $f26, $f0                           \n\t"
-    "punpckhbh  $f26, $f26, $f0                           \n\t"
-    "gssqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
-    "punpckhbh  $f30, $f28, $f0                           \n\t"
-    "punpcklbh  $f28, $f28, $f0                           \n\t"
-    "punpckhbh  $f18, $f16, $f0                           \n\t"
-    "punpcklbh  $f16, $f16, $f0                           \n\t"
-    "punpckhbh  $f22, $f20, $f0                           \n\t"
-    "punpcklbh  $f20, $f20, $f0                           \n\t"
-    "punpckhbh  $f14, $f12, $f0                           \n\t"
-    "punpcklbh  $f12, $f12, $f0                           \n\t"
-    "gssqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
-    /* Build the filter masks: absolute differences compared against the
-     * alpha ($f4/$f6) and beta ($f8/$f10) thresholds. */
-    "psubh      $f24, $f16, $f20                          \n\t"
-    "psubh      $f26, $f18, $f22                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
-    "pcmpgth    $f0, $f4, $f24                            \n\t"
-    "pcmpgth    $f2, $f6, $f26                            \n\t"
-    "psubh      $f24, $f12, $f16                          \n\t"
-    "psubh      $f26, $f14, $f18                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
-    "pcmpgth    $f28, $f8, $f24                           \n\t"
-    "pcmpgth    $f30, $f10, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 0x50(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f20                          \n\t"
-    "psubh      $f26, $f26, $f22                          \n\t"
-    "and        $f0, $f0, $f28                            \n\t"
-    "and        $f2, $f2, $f30                            \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f28, $f30)
-    /* The thresholds now live in FP registers, so the GPRs behind iAlpha and
-     * iBeta are reused to save $f20 / $f22 (restored further below). */
-    "dmfc1      %[iAlpha], $f20                           \n\t"
-    "dmfc1      %[iBeta], $f22                            \n\t"
-    "pcmpgth    $f28, $f8, $f24                           \n\t"
-    "pcmpgth    $f30, $f10, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
-    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f20                          \n\t"
-    "psubh      $f26, $f26, $f22                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
-    "pcmpgth    $f4, $f4, $f24                            \n\t"
-    "pcmpgth    $f6, $f6, $f26                            \n\t"
-    "gslqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
-    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f20                          \n\t"
-    "psubh      $f26, $f26, $f22                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
-    "and        $f0, $f0, $f28                            \n\t"
-    "and        $f2, $f2, $f30                            \n\t"
-    "pcmpgth    $f28, $f8, $f24                           \n\t"
-    "pcmpgth    $f30, $f10, $f26                          \n\t"
-    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
-    "gslqc1     $f22, $f20, 0x40(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f20                          \n\t"
-    "psubh      $f26, $f26, $f22                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f20, $f22)
-    /* $8 = 2: used both as the rounding term (broadcast, kept at tmp+0x20)
-     * and as the shift count for psrah. */
-    "dli        $8, 0x2                                   \n\t"
-    "and        $f4, $f4, $f28                            \n\t"
-    "and        $f6, $f6, $f30                            \n\t"
-    "pcmpgth    $f8, $f8, $f24                            \n\t"
-    "pcmpgth    $f10, $f10, $f26                          \n\t"
-    "and        $f4, $f4, $f8                             \n\t"
-    "and        $f6, $f6, $f10                            \n\t"
-    "dmtc1      $8, $f8                                   \n\t"
-    "punpcklhw  $f24, $f8, $f8                            \n\t"
-    "punpcklwd  $f8, $f24, $f24                           \n\t"
-    "mov.d      $f10, $f8                                 \n\t"
-    "gssqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
-    /* Filtered value = (2 * a + x + b + 2) >> 2, selected by the mask;
-     * unmasked lanes keep the original value (and / pandn / or). */
-    "paddh      $f8, $f12, $f12                           \n\t"
-    "paddh      $f10, $f14, $f14                          \n\t"
-    "paddh      $f8, $f8, $f16                            \n\t"
-    "paddh      $f10, $f10, $f18                          \n\t"
-    "gslqc1     $f22, $f20, 0x50(%[tmp])                  \n\t"
-    "paddh      $f8, $f8, $f20                            \n\t"
-    "paddh      $f10, $f10, $f22                          \n\t"
-    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
-    "paddh      $f8, $f8, $f24                            \n\t"
-    "paddh      $f10, $f10, $f26                          \n\t"
-    "dmtc1      $8, $f20                                  \n\t"
-    "psrah      $f8, $f8, $f20                            \n\t"
-    "psrah      $f10, $f10, $f20                          \n\t"
-    "and        $f24, $f0, $f8                            \n\t"
-    "and        $f26, $f2, $f10                           \n\t"
-    "pandn      $f8, $f0, $f16                            \n\t"
-    "pandn      $f10, $f2, $f18                           \n\t"
-    "or         $f24, $f24, $f8                           \n\t"
-    "or         $f26, $f26, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 0x60(%[tmp])                   \n\t"
-    "paddh      $f28, $f8, $f8                            \n\t"
-    "paddh      $f30, $f10, $f10                          \n\t"
-    "gslqc1     $f22, $f20, 0x30(%[tmp])                  \n\t"
-    "paddh      $f28, $f28, $f20                          \n\t"
-    "paddh      $f30, $f30, $f22                          \n\t"
-    "gslqc1     $f18, $f16, 0x70(%[tmp])                  \n\t"
-    "paddh      $f28, $f28, $f16                          \n\t"
-    "paddh      $f30, $f30, $f18                          \n\t"
-    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
-    "paddh      $f28, $f28, $f8                           \n\t"
-    "paddh      $f30, $f30, $f10                          \n\t"
-    "pandn      $f8, $f4, $f20                            \n\t"
-    "pandn      $f10, $f6, $f22                           \n\t"
-    "dmtc1      $8, $f20                                  \n\t"
-    "psrah      $f28, $f28, $f20                          \n\t"
-    "psrah      $f30, $f30, $f20                          \n\t"
-    "and        $f16, $f4, $f28                           \n\t"
-    "and        $f18, $f6, $f30                           \n\t"
-    "or         $f16, $f16, $f8                           \n\t"
-    "or         $f18, $f18, $f10                          \n\t"
-    "gslqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
-    /* Pack back to bytes and store the first filtered vector at tmp+0x90. */
-    "packushb   $f24, $f24, $f26                          \n\t"
-    "packushb   $f26, $f16, $f18                          \n\t"
-    "gssqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
-    "paddh      $f24, $f8, $f8                            \n\t"
-    "paddh      $f26, $f10, $f10                          \n\t"
-    /* Restore the values saved in the iAlpha / iBeta registers above. */
-    "dmtc1      %[iAlpha], $f20                           \n\t"
-    "dmtc1      %[iBeta], $f22                            \n\t"
-    "gslqc1     $f10, $f8, 0x20(%[tmp])                   \n\t"
-    "paddh      $f24, $f24, $f20                          \n\t"
-    "paddh      $f26, $f26, $f22                          \n\t"
-    "paddh      $f24, $f24, $f12                          \n\t"
-    "paddh      $f26, $f26, $f14                          \n\t"
-    "mov.d      $f16, $f0                                 \n\t"
-    "mov.d      $f18, $f2                                 \n\t"
-    "pandn      $f0, $f0, $f20                            \n\t"
-    "pandn      $f2, $f2, $f22                            \n\t"
-    "dmtc1      $8, $f20                                  \n\t"
-    "paddh      $f24, $f24, $f8                           \n\t"
-    "paddh      $f26, $f26, $f10                          \n\t"
-    "psrah      $f24, $f24, $f20                          \n\t"
-    "psrah      $f26, $f26, $f20                          \n\t"
-    "and        $f16, $f16, $f24                          \n\t"
-    "and        $f18, $f18, $f26                          \n\t"
-    "or         $f16, $f16, $f0                           \n\t"
-    "or         $f18, $f18, $f2                           \n\t"
-    "gslqc1     $f2, $f0, 0x70(%[tmp])                    \n\t"
-    "paddh      $f20, $f0, $f0                            \n\t"
-    "paddh      $f22, $f2, $f2                            \n\t"
-    "gslqc1     $f2, $f0, 0x40(%[tmp])                    \n\t"
-    "paddh      $f20, $f20, $f0                           \n\t"
-    "paddh      $f22, $f22, $f2                           \n\t"
-    "gslqc1     $f14, $f12, 0x60(%[tmp])                  \n\t"
-    "paddh      $f20, $f20, $f12                          \n\t"
-    "paddh      $f22, $f22, $f14                          \n\t"
-    "paddh      $f20, $f20, $f8                           \n\t"
-    "paddh      $f22, $f22, $f10                          \n\t"
-    "dmtc1      $8, $f8                                   \n\t"
-    "psrah      $f20, $f20, $f8                           \n\t"
-    "psrah      $f22, $f22, $f8                           \n\t"
-    "and        $f12, $f4, $f20                           \n\t"
-    "and        $f14, $f6, $f22                           \n\t"
-    "pandn      $f4, $f4, $f0                             \n\t"
-    "pandn      $f6, $f6, $f2                             \n\t"
-    "or         $f12, $f12, $f4                           \n\t"
-    "or         $f14, $f14, $f6                           \n\t"
-    /* Pack and store the second filtered vector at tmp+0xa0. */
-    "packushb   $f16, $f16, $f18                          \n\t"
-    "packushb   $f18, $f12, $f14                          \n\t"
-    "gssqc1     $f18, $f16, 0xa0(%[tmp])                  \n\t"
-    /* Reload all four vectors from tmp+0x80.. and transpose back to rows. */
-    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
-    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
-    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
-    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
-    "mov.d      $f26, $f2                                 \n\t"
-    "punpckhbh  $f2, $f0, $f4                             \n\t"
-    "punpcklbh  $f0, $f0, $f4                             \n\t"
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-    "mov.d      $f30, $f10                                \n\t"
-    "punpckhbh  $f10, $f8, $f12                           \n\t"
-    "punpcklbh  $f8, $f8, $f12                            \n\t"
-    "punpcklbh  $f28, $f30, $f14                          \n\t"
-    "punpckhbh  $f30, $f30, $f14                          \n\t"
-    "punpcklhw  $f16, $f2, $f10                           \n\t"
-    "punpckhhw  $f18, $f2, $f10                           \n\t"
-    "punpcklhw  $f20, $f26, $f30                          \n\t"
-    "punpckhhw  $f22, $f26, $f30                          \n\t"
-    "punpckhhw  $f2, $f0, $f8                             \n\t"
-    "punpcklhw  $f0, $f0, $f8                             \n\t"
-    "punpckhhw  $f26, $f24, $f28                          \n\t"
-    "punpcklhw  $f24, $f24, $f28                          \n\t"
-    "punpcklwd  $f4, $f2, $f26                            \n\t"
-    "punpckhwd  $f6, $f2, $f26                            \n\t"
-    "punpcklwd  $f8, $f18, $f22                           \n\t"
-    "punpckhwd  $f10, $f18, $f22                          \n\t"
-    "punpckhwd  $f2, $f0, $f24                            \n\t"
-    "punpcklwd  $f0, $f0, $f24                            \n\t"
-    "punpckhwd  $f18, $f16, $f20                          \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f20, $f2                                 \n\t"
-    "mov.d      $f24, $f6                                 \n\t"
-    "mov.d      $f2, $f16                                 \n\t"
-    "mov.d      $f22, $f18                                \n\t"
-    "mov.d      $f6, $f8                                  \n\t"
-    "mov.d      $f26, $f10                                \n\t"
-    /* iAlpha's register is reused once more: $f8 = 32, the shift that moves
-     * the Cr half of each row into the low word. */
-    "dli        %[iAlpha], 0x20                           \n\t"
-    "dmtc1      %[iAlpha], $f8                            \n\t"
-    /* Rows 0..3: store the low word to Cb, shift, store to Cr. */
-    "gsswlc1    $f0, 0x3($9)                              \n\t"
-    "gsswrc1    $f0, 0x0($9)                              \n\t"
-    "daddu      $12, $9, %[iStride]                       \n\t"
-    "gsswlc1    $f20, 0x3($12)                            \n\t"
-    "gsswrc1    $f20, 0x0($12)                            \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsswlc1    $f4, 0x3($12)                             \n\t"
-    "gsswrc1    $f4, 0x0($12)                             \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsswlc1    $f24, 0x3($12)                            \n\t"
-    "gsswrc1    $f24, 0x0($12)                            \n\t"
-    "dsrl       $f0, $f0, $f8                             \n\t"
-    "dsrl       $f20, $f20, $f8                           \n\t"
-    "dsrl       $f4, $f4, $f8                             \n\t"
-    "dsrl       $f24, $f24, $f8                           \n\t"
-    "gsswlc1    $f0, 0x3($10)                             \n\t"
-    "gsswrc1    $f0, 0x0($10)                             \n\t"
-    "daddu      $13, $10, %[iStride]                      \n\t"
-    "daddu      $8, $13, %[iStride]                       \n\t"
-    "gsswlc1    $f20, 0x3($13)                            \n\t"
-    "gsswrc1    $f20, 0x0($13)                            \n\t"
-    "daddu      $13, $8, %[iStride]                       \n\t"
-    "gsswlc1    $f4, 0x3($8)                              \n\t"
-    "gsswrc1    $f4, 0x0($8)                              \n\t"
-    "gsswlc1    $f24, 0x3($13)                            \n\t"
-    "gsswrc1    $f24, 0x0($13)                            \n\t"
-    /* Rows 4..7: same scheme, based on the advanced pPixCb / pPixCr. */
-    "gsswlc1    $f2, 0x3(%[pPixCb])                       \n\t"
-    "gsswrc1    $f2, 0x0(%[pPixCb])                       \n\t"
-    "daddu      $12, %[pPixCb], %[iStride]                \n\t"
-    "gsswlc1    $f22, 0x3($12)                            \n\t"
-    "gsswrc1    $f22, 0x0($12)                            \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsswlc1    $f6, 0x3($12)                             \n\t"
-    "gsswrc1    $f6, 0x0($12)                             \n\t"
-    "daddu      $12, $12, %[iStride]                      \n\t"
-    "gsswlc1    $f26, 0x3($12)                            \n\t"
-    "gsswrc1    $f26, 0x0($12)                            \n\t"
-    "dsrl       $f2, $f2, $f8                             \n\t"
-    "dsrl       $f22, $f22, $f8                           \n\t"
-    "dsrl       $f6, $f6, $f8                             \n\t"
-    "dsrl       $f26, $f26, $f8                           \n\t"
-    "gsswlc1    $f2, 0x3(%[pPixCr])                       \n\t"
-    "gsswrc1    $f2, 0x0(%[pPixCr])                       \n\t"
-    "daddu      $13, %[pPixCr], %[iStride]                \n\t"
-    "daddu      $8, $13, %[iStride]                       \n\t"
-    "gsswlc1    $f22, 0x3($13)                            \n\t"
-    "gsswrc1    $f22, 0x0($13)                            \n\t"
-    "daddu      $13, $8, %[iStride]                       \n\t"
-    "gsswlc1    $f6, 0x3($8)                              \n\t"
-    "gsswrc1    $f6, 0x0($8)                              \n\t"
-    "gsswlc1    $f26, 0x3($13)                            \n\t"
-    "gsswrc1    $f26, 0x0($13)                            \n\t"
-    /* iAlpha and iBeta are overwritten by the asm, hence "+&r" rather than
-     * input-only "r". */
-    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr),
-      [iAlpha]"+&r"(iAlpha), [iBeta]"+&r"(iBeta)
-    : [iStride]"r"((int)iStride), [tmp]"r"((unsigned char *)tmp)
-    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
-      "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22",
-      "$f24", "$f26", "$f28", "$f30"
-  );
-  RECOVER_REG;
-}
-
-void DeblockChromaLt4H_mmi(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
-                           int32_t iAlpha, int32_t iBeta, int8_t *pTC) {
-  unsigned char tmp[320] __attribute__((aligned(32)));
-  BACKUP_REG;
-  __asm__ volatile (
-    ".set       arch=loongson3a                           \n\t"
-    "daddiu     %[pPixCb], %[pPixCb], -0x2                \n\t"
-    "daddiu     %[pPixCr], %[pPixCr], -0x2                \n\t"
-    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
-    "gsldlc1    $f0, 0x7(%[pPixCb])                       \n\t"
-    "gsldlc1    $f4, 0x7($8)                              \n\t"
-    "gsldrc1    $f0, 0x0(%[pPixCb])                       \n\t"
-    "gsldrc1    $f4, 0x0($8)                              \n\t"
-    "daddu      $9, $8, %[iStride]                        \n\t"
-    "daddu      $8, $9, %[iStride]                        \n\t"
-    "gsldlc1    $f8, 0x7($9)                              \n\t"
-    "gsldlc1    $f12, 0x7($8)                             \n\t"
-    "gsldrc1    $f8, 0x0($9)                              \n\t"
-    "gsldrc1    $f12, 0x0($8)                             \n\t"
-    "daddu      $9, $8, %[iStride]                        \n\t"
-
-    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
-    "gsldlc1    $f16, 0x7(%[pPixCr])                      \n\t"
-    "gsldlc1    $f20, 0x7($10)                            \n\t"
-    "gsldrc1    $f16, 0x0(%[pPixCr])                      \n\t"
-    "gsldrc1    $f20, 0x0($10)                            \n\t"
-    "daddu      $11, $10, %[iStride]                      \n\t"
-    "daddu      $10, $11, %[iStride]                      \n\t"
-    "gsldlc1    $f24, 0x7($11)                            \n\t"
-    "gsldlc1    $f28, 0x7($10)                            \n\t"
-    "gsldrc1    $f24, 0x0($11)                            \n\t"
-    "gsldrc1    $f28, 0x0($10)                            \n\t"
-    "daddu      $11, $10, %[iStride]                      \n\t"
-
-    "punpcklwd  $f0, $f0, $f16                            \n\t"
-    "punpcklwd  $f4, $f4, $f20                            \n\t"
-    "punpcklwd  $f8, $f8, $f24                            \n\t"
-    "punpcklwd  $f12, $f12, $f28                          \n\t"
-    "gsldlc1    $f16, 0x7($9)                             \n\t"
-    "gsldlc1    $f20, 0x7($11)                            \n\t"
-    "gsldrc1    $f16, 0x0($9)                             \n\t"
-    "gsldrc1    $f20, 0x0($11)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f2, $f16                                 \n\t"
-    "daddu      $8, $9, %[iStride]                        \n\t"
-    "daddu      $10, $11, %[iStride]                      \n\t"
-    "gsldlc1    $f16, 0x7($8)                             \n\t"
-    "gsldlc1    $f20, 0x7($10)                            \n\t"
-    "gsldrc1    $f16, 0x0($8)                             \n\t"
-    "gsldrc1    $f20, 0x0($10)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f6, $f16                                 \n\t"
-    "daddu      $9, $8, %[iStride]                        \n\t"
-    "daddu      $11, $10, %[iStride]                      \n\t"
-
-    "gsldlc1    $f16, 0x7($9)                             \n\t"
-    "gsldlc1    $f20, 0x7($11)                            \n\t"
-    "gsldrc1    $f16, 0x0($9)                             \n\t"
-    "gsldrc1    $f20, 0x0($11)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f10, $f16                                \n\t"
-    "daddu      $8, $9, %[iStride]                        \n\t"
-    "daddu      $10, $11, %[iStride]                      \n\t"
-
-    "gsldlc1    $f16, 0x7($8)                             \n\t"
-    "gsldlc1    $f20, 0x7($10)                            \n\t"
-    "gsldrc1    $f16, 0x0($8)                             \n\t"
-    "gsldrc1    $f20, 0x0($10)                            \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-    "mov.d      $f14, $f16                                \n\t"
-
-    "punpcklbh  $f24, $f2, $f6                            \n\t"
-    "punpckhbh  $f26, $f2, $f6                            \n\t"
-    "punpckhbh  $f2, $f0, $f4                             \n\t"
-    "punpcklbh  $f0, $f0, $f4                             \n\t"
-    "punpcklbh  $f28, $f10, $f14                          \n\t"
-    "punpckhbh  $f30, $f10, $f14                          \n\t"
-    "punpckhbh  $f10, $f8, $f12                           \n\t"
-    "punpcklbh  $f8, $f8, $f12                            \n\t"
-
-    "punpcklhw  $f16, $f2, $f10                           \n\t"
-    "punpckhhw  $f18, $f2, $f10                           \n\t"
-    "punpckhhw  $f2, $f0, $f8                             \n\t"
-    "punpcklhw  $f0, $f0, $f8                             \n\t"
-    "punpcklhw  $f20, $f26, $f30                          \n\t"
-    "punpckhhw  $f22, $f26, $f30                          \n\t"
-    "punpckhhw  $f26, $f24, $f28                          \n\t"
-    "punpcklhw  $f24, $f24, $f28                          \n\t"
-
-    "punpcklwd  $f4, $f2, $f26                            \n\t"
-    "punpckhwd  $f6, $f2, $f26                            \n\t"
-    "punpckhwd  $f2, $f0, $f24                            \n\t"
-    "punpcklwd  $f0, $f0, $f24                            \n\t"
-    "punpcklwd  $f8, $f18, $f22                           \n\t"
-    "punpckhwd  $f10, $f18, $f22                          \n\t"
-    "punpckhwd  $f18, $f16, $f20                          \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-
-    "mov.d      $f20, $f2                                 \n\t"
-    "mov.d      $f22, $f18                                \n\t"
-    "mov.d      $f2, $f16                                 \n\t"
-    "mov.d      $f24, $f6                                 \n\t"
-    "mov.d      $f26, $f10                                \n\t"
-    "mov.d      $f6, $f8                                  \n\t"
-    "daddiu     $11, %[tmp], 0x70                         \n\t"
-
-    "gssqc1     $f2, $f0, 0x0($11)                        \n\t"
-    "gssqc1     $f22, $f20, 0x10($11)                     \n\t"
-    "gssqc1     $f6, $f4, 0x20($11)                       \n\t"
-    "gssqc1     $f26, $f24, 0x30($11)                     \n\t"
-
-    "lb         $8, 0x3(%[pTC])                           \n\t"
-    "lb         $9, 0x2(%[pTC])                           \n\t"
-    "lb         $10, 0x1(%[pTC])                          \n\t"
-    "lb         $11, 0x0(%[pTC])                          \n\t"
-
-    "and        $12, $8, 0xFFFF                           \n\t"
-    "dmtc1      $12, $f8                                  \n\t"
-
-    "and        $9, $9, 0xFFFF                            \n\t"
-    "dmtc1      $9, $f12                                  \n\t"
-    "mov.d      $f16, $f12                                \n\t"
-
-    "and        $9, $10, 0xFFFF                           \n\t"
-    "dmtc1      $9, $f20                                  \n\t"
-    "xor        $f0, $f0, $f0                             \n\t"
-    "mov.d      $f24, $f20                                \n\t"
-    "and        $9, $11, 0xFFFF                           \n\t"
-    "punpcklhw  $f24, $f24, $f8                           \n\t"
-
-    "mov.d      $f4, $f8                                  \n\t"
-    "dmtc1      $9, $f28                                  \n\t"
-    "mov.d      $f0, $f28                                 \n\t"
-
-    "punpcklhw  $f28, $f28, $f12                          \n\t"
-    "punpcklhw  $f20, $f20, $f4                           \n\t"
-    "xor        $f4, $f4, $f4                             \n\t"
-    "xor        $f6, $f6, $f6                             \n\t"
-    "punpcklhw  $f28, $f28, $f20                          \n\t"
-    "gslqc1     $f22, $f20, 0xA0(%[tmp])                  \n\t"
-    "punpcklhw  $f0, $f0, $f16                            \n\t"
-    "punpcklhw  $f0, $f0, $f24                            \n\t"
-
-    "gslqc1     $f26, $f24, 0x70(%[tmp])                  \n\t"
-    "punpckhhw  $f2, $f0, $f28                            \n\t"
-    "punpcklhw  $f0, $f0, $f28                            \n\t"
-    "gslqc1     $f30, $f28, 0x80(%[tmp])                  \n\t"
-    "psubh      $f8, $f4, $f0                             \n\t"
-    "psubh      $f10, $f6, $f2                            \n\t"
-    "gssqc1     $f10, $f8, 0xD0(%[tmp])                   \n\t"
-    "dmtc1      %[iAlpha], $f8                            \n\t"
-    "punpcklhw  $f12, $f8, $f8                            \n\t"
-    "punpcklwd  $f16, $f12, $f12                          \n\t"
-    "mov.d      $f18, $f16                                \n\t"
-
-    "dmtc1      %[iBeta], $f8                             \n\t"
-    "punpcklhw  $f12, $f8, $f8                            \n\t"
-    "punpcklwd  $f8, $f12, $f12                           \n\t"
-    "mov.d      $f10, $f8                                 \n\t"
-
-    "gslqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
-    "gssqc1     $f10, $f8, 0x50(%[tmp])                   \n\t"
-    "punpckhbh  $f10, $f24, $f4                           \n\t"
-    "punpcklbh  $f8, $f24, $f4                            \n\t"
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-
-    "gssqc1     $f10, $f8, 0x40(%[tmp])                   \n\t"
-    "gssqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
-    "gslqc1     $f26, $f24, 0x90(%[tmp])                  \n\t"
-    "punpcklbh  $f8, $f28, $f4                            \n\t"
-    "punpckhbh  $f10, $f28, $f4                           \n\t"
-    "punpcklbh  $f28, $f30, $f6                           \n\t"
-    "punpckhbh  $f30, $f30, $f6                           \n\t"
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-    "punpckhbh  $f14, $f12, $f4                           \n\t"
-    "punpcklbh  $f12, $f12, $f4                           \n\t"
-    "punpckhbh  $f22, $f20, $f4                           \n\t"
-    "punpcklbh  $f20, $f20, $f4                           \n\t"
-    "gssqc1     $f30, $f28, 0xF0(%[tmp])                  \n\t"
-    "gssqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
-    "gslqc1     $f26, $f24, 0xA0(%[tmp])                  \n\t"
-    "punpcklbh  $f24, $f26, $f6                           \n\t"
-    "punpckhbh  $f26, $f26, $f6                           \n\t"
-
-    "dli        $13, 0x4                                  \n\t"
-    "gssqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
-    "dmtc1      $13, $f24                                 \n\t"
-    "punpcklhw  $f28, $f24, $f24                          \n\t"
-    "punpcklwd  $f24, $f28, $f28                          \n\t"
-    "mov.d      $f26, $f24                                \n\t"
-    "dli        $12, 0x2                                  \n\t"
-    "dli        $13, 0x3                                  \n\t"
-
-    "gssqc1     $f2, $f0, 0x20(%[tmp])                    \n\t"
-    "dmfc1      %[iAlpha], $f0                            \n\t"
-    "dmfc1      %[iBeta], $f2                             \n\t"
-    "gssqc1     $f26, $f24, 0x30(%[tmp])                  \n\t"
-    "gslqc1     $f30, $f28, 0x40(%[tmp])                  \n\t"
-    "psubh      $f28, $f28, $f20                          \n\t"
-    "psubh      $f30, $f30, $f22                          \n\t"
-    "pcmpgth    $f24, $f0, $f4                            \n\t"
-    "pcmpgth    $f26, $f2, $f6                            \n\t"
-
-    "dmtc1      $12, $f0                                  \n\t"
-    "dmtc1      $13, $f2                                  \n\t"
-    "gssqc1     $f26, $f24, 0x60(%[tmp])                  \n\t"
-    "gslqc1     $f6, $f4, 0xD0(%[tmp])                    \n\t"
-    "psubh      $f24, $f12, $f8                           \n\t"
-    "psubh      $f26, $f14, $f10                          \n\t"
-    "psllh      $f24, $f24, $f0                           \n\t"
-    "psllh      $f26, $f26, $f0                           \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x30(%[tmp])                  \n\t"
-    "paddh      $f24, $f24, $f28                          \n\t"
-    "paddh      $f26, $f26, $f30                          \n\t"
-    "psrah      $f24, $f24, $f2                           \n\t"
-    "psrah      $f26, $f26, $f2                           \n\t"
-    "pmaxsh     $f4, $f4, $f24                            \n\t"
-    "pmaxsh     $f6, $f6, $f26                            \n\t"
-
-    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
-    "gslqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
-    "pminsh     $f24, $f24, $f4                           \n\t"
-    "pminsh     $f26, $f26, $f6                           \n\t"
-
-    "gssqc1     $f26, $f24, 0x20(%[tmp])                  \n\t"
-    "psubh      $f4, $f8, $f12                            \n\t"
-    "psubh      $f6, $f10, $f14                           \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
-    "pcmpgth    $f24, $f16, $f4                           \n\t"
-    "pcmpgth    $f26, $f18, $f6                           \n\t"
-    "gslqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
-    "psubh      $f4, $f4, $f8                             \n\t"
-    "psubh      $f6, $f6, $f10                            \n\t"
-    WELS_AbsH($f4, $f6, $f4, $f6, $f0, $f2)
-    "pcmpgth    $f28, $f28, $f4                           \n\t"
-    "pcmpgth    $f30, $f30, $f6                           \n\t"
-
-    "gslqc1     $f6, $f4, 0x50(%[tmp])                    \n\t"
-    "and        $f24, $f24, $f28                          \n\t"
-    "and        $f26, $f26, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
-    "psubh      $f20, $f20, $f12                          \n\t"
-    "psubh      $f22, $f22, $f14                          \n\t"
-    WELS_AbsH($f20, $f22, $f20, $f22, $f0, $f2)
-    "pcmpgth    $f4, $f4, $f20                            \n\t"
-    "pcmpgth    $f6, $f6, $f22                            \n\t"
-
-    "gslqc1     $f22, $f20, 0xB0(%[tmp])                  \n\t"
-    "gslqc1     $f2, $f0, 0xE0(%[tmp])                    \n\t"
-    "psubh      $f20, $f20, $f0                           \n\t"
-    "psubh      $f22, $f22, $f2                           \n\t"
-    "and        $f24, $f24, $f4                           \n\t"
-    "and        $f26, $f26, $f6                           \n\t"
-    "gslqc1     $f2, $f0, 0x60(%[tmp])                    \n\t"
-    "and        $f24, $f24, $f0                           \n\t"
-    "and        $f26, $f26, $f2                           \n\t"
-
-    "gslqc1     $f6, $f4, 0x20(%[tmp])                    \n\t"
-    "and        $f4, $f4, $f24                            \n\t"
-    "and        $f6, $f6, $f26                            \n\t"
-    "gslqc1     $f26, $f24, 0xC0(%[tmp])                  \n\t"
-    "gssqc1     $f6, $f4, 0x40(%[tmp])                    \n\t"
-    "gslqc1     $f6, $f4, 0xF0(%[tmp])                    \n\t"
-
-    "dmtc1      $12, $f0                                  \n\t"
-    "psubh      $f24, $f24, $f4                           \n\t"
-    "psubh      $f26, $f26, $f6                           \n\t"
-    "psllh      $f24, $f24, $f0                           \n\t"
-    "psllh      $f26, $f26, $f0                           \n\t"
-    "paddh      $f24, $f24, $f20                          \n\t"
-    "paddh      $f26, $f26, $f22                          \n\t"
-    "gslqc1     $f2, $f0, 0x30(%[tmp])                    \n\t"
-    "paddh      $f24, $f24, $f0                           \n\t"
-    "paddh      $f26, $f26, $f2                           \n\t"
-    "dmtc1      %[iBeta], $f2                             \n\t"
-
-    "dmtc1      $13, $f0                                  \n\t"
-    "gslqc1     $f22, $f20, 0xD0(%[tmp])                  \n\t"
-    "psrah      $f24, $f24, $f0                           \n\t"
-    "psrah      $f26, $f26, $f0                           \n\t"
-    "dmtc1      %[iAlpha], $f0                            \n\t"
-    "pmaxsh     $f20, $f20, $f24                          \n\t"
-    "pmaxsh     $f22, $f22, $f26                          \n\t"
-    "pminsh     $f0, $f0, $f20                            \n\t"
-    "pminsh     $f2, $f2, $f22                            \n\t"
-
-    "dmfc1      %[iAlpha], $f0                            \n\t"
-    "dmfc1      %[iBeta], $f2                             \n\t"
-    "gslqc1     $f22, $f20, 0xC0(%[tmp])                  \n\t"
-    "psubh      $f24, $f4, $f20                           \n\t"
-    "psubh      $f26, $f6, $f22                           \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
-    "pcmpgth    $f16, $f16, $f24                          \n\t"
-    "pcmpgth    $f18, $f18, $f26                          \n\t"
-
-    "gslqc1     $f26, $f24, 0xB0(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f4                           \n\t"
-    "psubh      $f26, $f26, $f6                           \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
-    "pcmpgth    $f28, $f28, $f24                          \n\t"
-    "pcmpgth    $f30, $f30, $f26                          \n\t"
-
-    "gslqc1     $f26, $f24, 0xE0(%[tmp])                  \n\t"
-    "and        $f16, $f16, $f28                          \n\t"
-    "and        $f18, $f18, $f30                          \n\t"
-
-    "gslqc1     $f30, $f28, 0x50(%[tmp])                  \n\t"
-    "psubh      $f24, $f24, $f20                          \n\t"
-    "psubh      $f26, $f26, $f22                          \n\t"
-    WELS_AbsH($f24, $f26, $f24, $f26, $f0, $f2)
-    "pcmpgth    $f28, $f28, $f24                          \n\t"
-    "pcmpgth    $f30, $f30, $f26                          \n\t"
-    "and        $f16, $f16, $f28                          \n\t"
-    "and        $f18, $f18, $f30                          \n\t"
-    "gslqc1     $f30, $f28, 0x60(%[tmp])                  \n\t"
-    "dmtc1      %[iAlpha], $f0                            \n\t"
-    "dmtc1      %[iBeta], $f2                             \n\t"
-    "and        $f16, $f16, $f28                          \n\t"
-    "and        $f18, $f18, $f30                          \n\t"
-    "and        $f0, $f0, $f16                            \n\t"
-    "and        $f2, $f2, $f18                            \n\t"
-
-    "gslqc1     $f18, $f16, 0x40(%[tmp])                  \n\t"
-    "paddh      $f8, $f8, $f16                            \n\t"
-    "paddh      $f10, $f10, $f18                          \n\t"
-    "paddh      $f4, $f4, $f0                             \n\t"
-    "paddh      $f6, $f6, $f2                             \n\t"
-    "psubh      $f12, $f12, $f16                          \n\t"
-    "psubh      $f14, $f14, $f18                          \n\t"
-    "psubh      $f20, $f20, $f0                           \n\t"
-    "psubh      $f22, $f22, $f2                           \n\t"
-    "packushb   $f8, $f8, $f10                            \n\t"
-    "packushb   $f10, $f4, $f6                            \n\t"
-    "packushb   $f12, $f12, $f14                          \n\t"
-    "packushb   $f14, $f20, $f22                          \n\t"
-
-    "gssqc1     $f10, $f8, 0x80(%[tmp])                   \n\t"
-    "gssqc1     $f14, $f12, 0x90(%[tmp])                  \n\t"
-    "daddiu     $11, %[tmp], 0x70                         \n\t"
-
-    "gslqc1     $f2, $f0, 0x0($11)                        \n\t"
-    "gslqc1     $f6, $f4, 0x10($11)                       \n\t"
-    "gslqc1     $f10, $f8, 0x20($11)                      \n\t"
-    "gslqc1     $f14, $f12, 0x30($11)                     \n\t"
-
-    "punpcklbh  $f24, $f2, $f6                            \n\t"
-    "punpckhbh  $f26, $f2, $f6                            \n\t"
-    "punpckhbh  $f2, $f0, $f4                             \n\t"
-    "punpcklbh  $f0, $f0, $f4                             \n\t"
-
-    "punpcklbh  $f28, $f10, $f14                          \n\t"
-    "punpckhbh  $f30, $f10, $f14                          \n\t"
-    "punpckhbh  $f10, $f8, $f12                           \n\t"
-    "punpcklbh  $f8, $f8, $f12                            \n\t"
-
-    "punpcklhw  $f16, $f2, $f10                           \n\t"
-    "punpckhhw  $f18, $f2, $f10                           \n\t"
-    "punpckhhw  $f2, $f0, $f8                             \n\t"
-    "punpcklhw  $f0, $f0, $f8                             \n\t"
-    "punpcklhw  $f20, $f26, $f30                          \n\t"
-    "punpckhhw  $f22, $f26, $f30                          \n\t"
-    "punpckhhw  $f26, $f24, $f28                          \n\t"
-    "punpcklhw  $f24, $f24, $f28                          \n\t"
-
-    "punpcklwd  $f4, $f2, $f26                            \n\t"
-    "punpckhwd  $f6, $f2, $f26                            \n\t"
-    "punpckhwd  $f2, $f0, $f24                            \n\t"
-    "punpcklwd  $f0, $f0, $f24                            \n\t"
-    "punpcklwd  $f8, $f18, $f22                           \n\t"
-    "punpckhwd  $f10, $f18, $f22                          \n\t"
-    "punpckhwd  $f18, $f16, $f20                          \n\t"
-    "punpcklwd  $f16, $f16, $f20                          \n\t"
-
-    "mov.d      $f20, $f2                                 \n\t"
-    "mov.d      $f22, $f18                                \n\t"
-    "mov.d      $f2, $f16                                 \n\t"
-    "mov.d      $f24, $f6                                 \n\t"
-    "mov.d      $f26, $f10                                \n\t"
-    "mov.d      $f6, $f8                                  \n\t"
-
-    "dli        %[iAlpha], 0x20                           \n\t"
-    "daddu      $8, %[pPixCb], %[iStride]                 \n\t"
-    "gsswlc1    $f0, 0x3(%[pPixCb])                       \n\t"
-    "gsswlc1    $f20, 0x3($8)                             \n\t"
-    "gsswrc1    $f0, 0x0(%[pPixCb])                       \n\t"
-    "gsswrc1    $f20, 0x0($8)                             \n\t"
-    "daddu      $9, $8, %[iStride]                        \n\t"
-    "daddu      $8, $9, %[iStride]                        \n\t"
-    "gsswlc1    $f4, 0x3($9)                              \n\t"
-    "gsswlc1    $f24, 0x3($8)                             \n\t"
-    "gsswrc1    $f4, 0x0($9)                              \n\t"
-    "gsswrc1    $f24, 0x0($8)                             \n\t"
-    "daddu      $9, $8, %[iStride]                        \n\t"
-    "dmtc1      %[iAlpha], $f8                            \n\t"
-
-    "dsrl       $f0, $f0, $f8                             \n\t"
-    "dsrl       $f20, $f20, $f8                           \n\t"
-    "dsrl       $f4, $f4, $f8                             \n\t"
-    "dsrl       $f24, $f24, $f8                           \n\t"
-    "daddu      $10, %[pPixCr], %[iStride]                \n\t"
-    "gsswlc1    $f0, 0x3(%[pPixCr])                       \n\t"
-    "gsswlc1    $f20, 0x3($10)                            \n\t"
-    "gsswrc1    $f0, 0x0(%[pPixCr])                       \n\t"
-    "gsswrc1    $f20, 0x0($10)                            \n\t"
-    "daddu      $11, $10, %[iStride]                      \n\t"
-    "daddu      $10, $11, %[iStride]                      \n\t"
-    "gsswlc1    $f4, 0x3($11)                             \n\t"
-    "gsswlc1    $f24, 0x3($10)                            \n\t"
-    "gsswrc1    $f4, 0x0($11)                             \n\t"
-    "gsswrc1    $f24, 0x0($10)                            \n\t"
-    "daddu      $11, $10, %[iStride]                      \n\t"
-
-    "daddu      $8, $9, %[iStride]                        \n\t"
-    "gsswlc1    $f2, 0x3($9)                              \n\t"
-    "gsswlc1    $f22, 0x3($8)                             \n\t"
-    "gsswrc1    $f2, 0x0($9)                              \n\t"
-    "gsswrc1    $f22, 0x0($8)                             \n\t"
-    "daddu      $9, $8, %[iStride]                        \n\t"
-    "daddu      $8, $9, %[iStride]                        \n\t"
-    "gsswlc1    $f6, 0x3($9)                              \n\t"
-    "gsswlc1    $f26, 0x3($8)                             \n\t"
-    "gsswrc1    $f6, 0x0($9)                              \n\t"
-    "gsswrc1    $f26, 0x0($8)                             \n\t"
-
-    "dsrl       $f2, $f2, $f8                             \n\t"
-    "dsrl       $f22, $f22, $f8                           \n\t"
-    "dsrl       $f6, $f6, $f8                             \n\t"
-    "dsrl       $f26, $f26, $f8                           \n\t"
-    "daddu      $10, $11, %[iStride]                      \n\t"
-    "gsswlc1    $f2, 0x3($11)                             \n\t"
-    "gsswlc1    $f22, 0x3($10)                            \n\t"
-    "gsswrc1    $f2, 0x0($11)                             \n\t"
-    "gsswrc1    $f22, 0x0($10)                            \n\t"
-    "daddu      $11, $10, %[iStride]                      \n\t"
-    "daddu      $10, $11, %[iStride]                      \n\t"
-    "gsswlc1    $f6, 0x3($11)                             \n\t"
-    "gsswlc1    $f26, 0x3($10)                            \n\t"
-    "gsswrc1    $f6, 0x0($11)                             \n\t"
-    "gsswrc1    $f26, 0x0($10)                            \n\t"
-    : [pPixCb]"+&r"((unsigned char *)pPixCb), [pPixCr]"+&r"((unsigned char *)pPixCr)
-    : [iStride]"r"((int)iStride), [iAlpha]"r"(iAlpha),
-      [iBeta]"r"(iBeta), [tmp]"r"((unsigned char *)tmp), [pTC]"r"((char *)pTC)
-    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4",
-      "$f6", "$f8", "$f10", "$f12","$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
-      "$f26", "$f28", "$f30"
-  );
-  RECOVER_REG;
-}
-
-void WelsNonZeroCount_mmi(int8_t *pNonZeroCount) {
-  __asm__ volatile(
-    ".set       arch=loongson3a                 \n\t"
-    "gsldlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
-    "gsldlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
-    "gsldlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
-    "gsldrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
-    "gsldrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
-    "gsldrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
-    "pcmpeqh    $f8, $f8, $f8                   \n\t"
-    "dli        $8, 0xF                         \n\t"
-    "dmtc1      $8, $f6                         \n\t"
-    "psrlh      $f8, $f8, $f6                   \n\t"
-    "packushb   $f8, $f8, $f8                   \n\t"
-
-    "pminub     $f0, $f0, $f8                   \n\t"
-    "pminub     $f2, $f2, $f8                   \n\t"
-    "pminub     $f4, $f4, $f8                   \n\t"
-    "gssdlc1    $f0, 0x7(%[pNonZeroCount])      \n\t"
-    "gssdlc1    $f2, 0xF(%[pNonZeroCount])      \n\t"
-    "gssdlc1    $f4, 0x17(%[pNonZeroCount])     \n\t"
-    "gssdrc1    $f0, 0x0(%[pNonZeroCount])      \n\t"
-    "gssdrc1    $f2, 0x8(%[pNonZeroCount])      \n\t"
-    "gssdrc1    $f4, 0x10(%[pNonZeroCount])     \n\t"
-    :
-    : [pNonZeroCount] "r"((unsigned char *)pNonZeroCount)
-    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8"
-  );
-}
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -63,14 +63,14 @@
 endif
 OBJS += $(COMMON_OBJSARM64)
 
-COMMON_ASM_MIPS64_SRCS=\
-	$(COMMON_SRCDIR)/mips64/deblock_mmi.c\
+COMMON_ASM_MIPS_SRCS=\
+	$(COMMON_SRCDIR)/mips/deblock_mmi.c\
 
-COMMON_OBJSMIPS64 += $(COMMON_ASM_MIPS64_SRCS:.c=.$(OBJ))
-ifeq ($(ASM_ARCH), mips64)
-COMMON_OBJS += $(COMMON_OBJSMIPS64)
+COMMON_OBJSMIPS += $(COMMON_ASM_MIPS_SRCS:.c=.$(OBJ))
+ifeq ($(ASM_ARCH), mips)
+COMMON_OBJS += $(COMMON_OBJSMIPS)
 endif
-OBJS += $(COMMON_OBJSMIPS64)
+OBJS += $(COMMON_OBJSMIPS)
 
 OBJS += $(COMMON_OBJS)
 
@@ -77,14 +77,14 @@
 $(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.cpp
 	$(QUIET_CXX)$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c $(CXX_O) $<
 
+$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c $(CXX_O) $<
+
 $(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.asm
 	$(QUIET_ASM)$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $@ $<
 
 $(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.S
 	$(QUIET_CCAS)$(CCAS) $(CCASFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
-
-$(COMMON_SRCDIR)/%.$(OBJ): $(COMMON_SRCDIR)/%.c
-	$(QUIET_CC)$(CC) $(CFLAGS) $(ASMFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $@ $<
 
 $(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
 	$(QUIET)rm -f $@