shithub: openh264

Download patch

ref: 9e2abda78f0fc0e6a4c2a7a3e2e3067404acdade
parent: 07cb68d0a6e0a4dd71787021e57bc59fd3fcc543
author: gxw <[email protected]>
date: Fri Aug 10 07:22:31 EDT 2018

Add optimization files in codec/common/mips

1. Add copy_mb_mmi.c, expand_picture_mmi.c, satd_sad_mmi.c and
intra_pred_com_mmi.c in codec/common/mips
2. Modify codec/common/inc/asmdefs_mmi.h format

Change-Id: I065cdb7574067abfbd8701fe57d2a4fef043d398

--- a/codec/common/inc/asmdefs_mmi.h
+++ b/codec/common/inc/asmdefs_mmi.h
@@ -35,303 +35,306 @@
 
 #define CACHE_LINE_SIZE 32
 
-#if defined(__mips64) && defined(__LP64__)
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
 # define mips_reg       int64_t
+# define PTRSIZE        " 8 "
+# define PTRLOG         " 3 "
 # define PTR_ADDU       "daddu "
 # define PTR_ADDIU      "daddiu "
 # define PTR_ADDI       "daddi "
 # define PTR_SUBU       "dsubu "
 # define PTR_L          "ld "
+# define PTR_S          "sd "
 # define PTR_SRA        "dsra "
 # define PTR_SRL        "dsrl "
 # define PTR_SLL        "dsll "
-# define PTR_MTC1       "dmtc1 "
-# define PTR_LI         "dli "
 #else
 # define mips_reg       int32_t
+# define PTRSIZE        " 4 "
+# define PTRLOG         " 2 "
 # define PTR_ADDU       "addu "
 # define PTR_ADDIU      "addiu "
 # define PTR_ADDI       "addi "
 # define PTR_SUBU       "subu "
 # define PTR_L          "lw "
+# define PTR_S          "sw "
 # define PTR_SRA        "sra "
 # define PTR_SRL        "srl "
 # define PTR_SLL        "sll "
-# define PTR_MTC1       "mtc1 "
-# define PTR_LI         "li "
 #endif
 
 #define MMI_XSawp_BH(f0, f2, f4, f6, f8, f10) \
-           "mov.d      "#f8", "#f2"                \n\t" \
-           "punpckhbh  "#f2", "#f0", "#f4"         \n\t" \
-           "punpcklbh  "#f0", "#f0", "#f4"         \n\t" \
-           "punpckhbh  "#f10", "#f8", "#f6"        \n\t" \
-           "punpcklbh  "#f8", "#f8", "#f6"         \n\t"
+  "mov.d      "#f8", "#f2"                \n\t" \
+  "punpckhbh  "#f2", "#f0", "#f4"         \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f4"         \n\t" \
+  "punpckhbh  "#f10", "#f8", "#f6"        \n\t" \
+  "punpcklbh  "#f8", "#f8", "#f6"         \n\t"
 
 #define MMI_XSawp_HW(f0, f2, f4, f6, f8, f10) \
-           "mov.d      "#f8", "#f2"                \n\t" \
-           "punpckhhw  "#f2", "#f0", "#f4"         \n\t" \
-           "punpcklhw  "#f0", "#f0", "#f4"         \n\t" \
-           "punpckhhw  "#f10", "#f8", "#f6"        \n\t" \
-           "punpcklhw  "#f8", "#f8", "#f6"         \n\t"
+  "mov.d      "#f8", "#f2"                \n\t" \
+  "punpckhhw  "#f2", "#f0", "#f4"         \n\t" \
+  "punpcklhw  "#f0", "#f0", "#f4"         \n\t" \
+  "punpckhhw  "#f10", "#f8", "#f6"        \n\t" \
+  "punpcklhw  "#f8", "#f8", "#f6"         \n\t"
 
 #define MMI_XSawp_WD(f0, f2, f4, f6, f8, f10) \
-           "mov.d      "#f8", "#f2"                \n\t" \
-           "punpckhwd  "#f2", "#f0", "#f4"         \n\t" \
-           "punpcklwd  "#f0", "#f0", "#f4"         \n\t" \
-           "punpckhwd  "#f10", "#f8", "#f6"        \n\t" \
-           "punpcklwd  "#f8", "#f8", "#f6"         \n\t"
+  "mov.d      "#f8", "#f2"                \n\t" \
+  "punpckhwd  "#f2", "#f0", "#f4"         \n\t" \
+  "punpcklwd  "#f0", "#f0", "#f4"         \n\t" \
+  "punpckhwd  "#f10", "#f8", "#f6"        \n\t" \
+  "punpcklwd  "#f8", "#f8", "#f6"         \n\t"
 
 #define MMI_XSawp_DQ(f0, f2, f4, f6, f8, f10) \
-           "mov.d      "#f8", "#f2"                \n\t" \
-           "mov.d      "#f2", "#f4"                \n\t" \
-           "mov.d      "#f10", "#f6"               \n\t"
+  "mov.d      "#f8", "#f2"                \n\t" \
+  "mov.d      "#f2", "#f4"                \n\t" \
+  "mov.d      "#f10", "#f6"               \n\t"
 
 #define WELS_AbsH(f0, f2, f4, f6, f8, f10) \
-           "xor        "#f8", "#f8", "#f8"         \n\t" \
-           "psubh      "#f10", "#f8", "#f6"        \n\t" \
-           "psubh      "#f8", "#f8", "#f4"         \n\t" \
-           "pmaxsh     "#f0", "#f4", "#f8"         \n\t" \
-           "pmaxsh     "#f2", "#f6", "#f10"        \n\t"
+  "xor        "#f8", "#f8", "#f8"         \n\t" \
+  "psubh      "#f10", "#f8", "#f6"        \n\t" \
+  "psubh      "#f8", "#f8", "#f4"         \n\t" \
+  "pmaxsh     "#f0", "#f4", "#f8"         \n\t" \
+  "pmaxsh     "#f2", "#f6", "#f10"        \n\t"
 
 #define MMI_SumSub(f0, f2, f4, f6, f8, f10) \
-           "mov.d      "#f8", "#f4"                    \n\t" \
-           "mov.d      "#f10", "#f6"                   \n\t" \
-           "paddh      "#f4", "#f4", "#f0"             \n\t" \
-           "paddh      "#f6", "#f6", "#f2"             \n\t" \
-           "psubh      "#f0", "#f0", "#f8"             \n\t" \
-           "psubh      "#f2", "#f2", "#f10"            \n\t"
+  "mov.d      "#f8", "#f4"                    \n\t" \
+  "mov.d      "#f10", "#f6"                   \n\t" \
+  "paddh      "#f4", "#f4", "#f0"             \n\t" \
+  "paddh      "#f6", "#f6", "#f2"             \n\t" \
+  "psubh      "#f0", "#f0", "#f8"             \n\t" \
+  "psubh      "#f2", "#f2", "#f10"            \n\t"
 
 #define MMI_LoadDiff8P(f0, f2, f4, f6, f8, r0, r1) \
-           "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
-           "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
-           "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
-           "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
-           "punpckhbh  "#f2", "#f0", "#f8"             \n\t" \
-           "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
-           "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
-           "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
-           "psubh      "#f0", "#f0", "#f4"             \n\t" \
-           "psubh      "#f2", "#f2", "#f6"             \n\t"
+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
+  "punpckhbh  "#f2", "#f0", "#f8"             \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
+  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
+  "psubh      "#f0", "#f0", "#f4"             \n\t" \
+  "psubh      "#f2", "#f2", "#f6"             \n\t"
 
 #define MMI_TransTwo4x4H(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
-           MMI_XSawp_HW(f0, f2, f4, f6, f16, f18)  \
-           MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
-           MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
-           MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
-           MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)  \
-           MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
+  MMI_XSawp_HW(f0, f2, f4, f6, f16, f18)  \
+  MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
+  MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
+  MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
+  MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6)  \
+  MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
 
 #define MMI_TransTwo8x8B(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28, f30, r0, r1) \
-           "dmfc1      "#r0", "#f28"                   \n\t" \
-           "dmfc1      "#r1", "#f30"                   \n\t" \
-           MMI_XSawp_BH(f0, f2, f4, f6, f28, f30)            \
-           MMI_XSawp_BH(f8, f10, f12, f14, f4, f6)           \
-           MMI_XSawp_BH(f16, f18, f20, f22, f12, f14)        \
-           "dmtc1      "#r0", "#f20"                   \n\t" \
-           "dmtc1      "#r1", "#f22"                   \n\t" \
-           "dmfc1      "#r0", "#f12"                   \n\t" \
-           "dmfc1      "#r1", "#f14"                   \n\t" \
-           MMI_XSawp_BH(f24, f26, f20, f22, f12, f14)        \
-           MMI_XSawp_HW(f0, f2, f8, f10, f20, f22)           \
-           MMI_XSawp_HW(f28, f30, f4, f6, f8, f10)           \
-           MMI_XSawp_HW(f16, f18, f24, f26, f4, f6)          \
-           "dmtc1      "#r0", "#f24"                   \n\t" \
-           "dmtc1      "#r1", "#f26"                   \n\t" \
-           "dmfc1      "#r0", "#f8"                    \n\t" \
-           "dmfc1      "#r1", "#f10"                   \n\t" \
-           MMI_XSawp_HW(f24, f26, f12, f14, f8, f10)         \
-           MMI_XSawp_WD(f0, f2, f16, f18, f12, f14)          \
-           MMI_XSawp_WD(f20, f22, f4, f6, f16, f18)          \
-           MMI_XSawp_WD(f28, f30, f24, f26, f4, f6)          \
-           "dmtc1      "#r0", "#f24"                   \n\t" \
-           "dmtc1      "#r1", "#f26"                   \n\t" \
-           "dmfc1      "#r0", "#f16"                   \n\t" \
-           "dmfc1      "#r1", "#f18"                   \n\t" \
-           MMI_XSawp_WD(f24, f26, f8, f10, f16, f18)         \
-           MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10)           \
-           MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30)          \
-           MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6)          \
-           "dmtc1      "#r0", "#f24"                   \n\t" \
-           "dmtc1      "#r1", "#f26"                   \n\t" \
-           "dmfc1      "#r0", "#f0"                    \n\t" \
-           "dmfc1      "#r1", "#f2"                    \n\t" \
-           MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2)          \
-           "dmtc1      "#r0", "#f16"                   \n\t" \
-           "dmtc1      "#r1", "#f18"                   \n\t"
+  "dmfc1      "#r0", "#f28"                   \n\t" \
+  "dmfc1      "#r1", "#f30"                   \n\t" \
+  MMI_XSawp_BH(f0, f2, f4, f6, f28, f30)            \
+  MMI_XSawp_BH(f8, f10, f12, f14, f4, f6)           \
+  MMI_XSawp_BH(f16, f18, f20, f22, f12, f14)        \
+  "dmtc1      "#r0", "#f20"                   \n\t" \
+  "dmtc1      "#r1", "#f22"                   \n\t" \
+  "dmfc1      "#r0", "#f12"                   \n\t" \
+  "dmfc1      "#r1", "#f14"                   \n\t" \
+  MMI_XSawp_BH(f24, f26, f20, f22, f12, f14)        \
+  MMI_XSawp_HW(f0, f2, f8, f10, f20, f22)           \
+  MMI_XSawp_HW(f28, f30, f4, f6, f8, f10)           \
+  MMI_XSawp_HW(f16, f18, f24, f26, f4, f6)          \
+  "dmtc1      "#r0", "#f24"                   \n\t" \
+  "dmtc1      "#r1", "#f26"                   \n\t" \
+  "dmfc1      "#r0", "#f8"                    \n\t" \
+  "dmfc1      "#r1", "#f10"                   \n\t" \
+  MMI_XSawp_HW(f24, f26, f12, f14, f8, f10)         \
+  MMI_XSawp_WD(f0, f2, f16, f18, f12, f14)          \
+  MMI_XSawp_WD(f20, f22, f4, f6, f16, f18)          \
+  MMI_XSawp_WD(f28, f30, f24, f26, f4, f6)          \
+  "dmtc1      "#r0", "#f24"                   \n\t" \
+  "dmtc1      "#r1", "#f26"                   \n\t" \
+  "dmfc1      "#r0", "#f16"                   \n\t" \
+  "dmfc1      "#r1", "#f18"                   \n\t" \
+  MMI_XSawp_WD(f24, f26, f8, f10, f16, f18)         \
+  MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10)           \
+  MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30)          \
+  MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6)          \
+  "dmtc1      "#r0", "#f24"                   \n\t" \
+  "dmtc1      "#r1", "#f26"                   \n\t" \
+  "dmfc1      "#r0", "#f0"                    \n\t" \
+  "dmfc1      "#r1", "#f2"                    \n\t" \
+  MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2)          \
+  "dmtc1      "#r0", "#f16"                   \n\t" \
+  "dmtc1      "#r1", "#f18"                   \n\t"
 
 #define MMI_XSwap_HW_SINGLE(f0, f2, f4) \
-           "mov.d      "#f4", "#f0"                    \n\t" \
-           "punpckhhw  "#f4", "#f4", "#f2"             \n\t" \
-           "punpcklhw  "#f0", "#f0", "#f2"             \n\t"
+  "mov.d      "#f4", "#f0"                    \n\t" \
+  "punpckhhw  "#f4", "#f4", "#f2"             \n\t" \
+  "punpcklhw  "#f0", "#f0", "#f2"             \n\t"
 
 #define MMI_XSwap_WD_SINGLE(f0, f2, f4) \
-           "mov.d      "#f4", "#f0"                    \n\t" \
-           "punpckhwd  "#f4", "#f4", "#f2"             \n\t" \
-           "punpcklwd  "#f0", "#f0", "#f2"             \n\t"
+  "mov.d      "#f4", "#f0"                    \n\t" \
+  "punpckhwd  "#f4", "#f4", "#f2"             \n\t" \
+  "punpcklwd  "#f0", "#f0", "#f2"             \n\t"
 
 #define MMI_Trans4x4H_SINGLE(f0, f2, f4, f6, f8) \
-           MMI_XSwap_HW_SINGLE(f0, f2, f8)              \
-           MMI_XSwap_HW_SINGLE(f4, f6, f2)              \
-           MMI_XSwap_WD_SINGLE(f0, f4, f6)              \
-           MMI_XSwap_WD_SINGLE(f8, f2, f4)
+  MMI_XSwap_HW_SINGLE(f0, f2, f8)              \
+  MMI_XSwap_HW_SINGLE(f4, f6, f2)              \
+  MMI_XSwap_WD_SINGLE(f0, f4, f6)              \
+  MMI_XSwap_WD_SINGLE(f8, f2, f4)
 
 #define MMI_SumSub_SINGLE(f0, f2, f4) \
-           "mov.d      "#f4", "#f2"                    \n\t" \
-           "psubh      "#f2", "#f2", "#f0"             \n\t" \
-           "paddh      "#f0", "#f0", "#f4"             \n\t"
+  "mov.d      "#f4", "#f2"                    \n\t" \
+  "psubh      "#f2", "#f2", "#f0"             \n\t" \
+  "paddh      "#f0", "#f0", "#f4"             \n\t"
 
 #define MMI_SumSubMul2_SINGLE(f0, f2, f4, f6) \
-           "mov.d      "#f4", "#f0"                    \n\t" \
-           "psllh      "#f0", "#f0", "#f6"             \n\t" \
-           "paddh      "#f0", "#f0", "#f2"             \n\t" \
-           "psllh      "#f2", "#f2", "#f6"             \n\t" \
-           "psubh      "#f4", "#f4", "#f2"             \n\t"
+  "mov.d      "#f4", "#f0"                    \n\t" \
+  "psllh      "#f0", "#f0", "#f6"             \n\t" \
+  "paddh      "#f0", "#f0", "#f2"             \n\t" \
+  "psllh      "#f2", "#f2", "#f6"             \n\t" \
+  "psubh      "#f4", "#f4", "#f2"             \n\t"
 
 //f4 should be 0x0
 #define MMI_Copy8Times(f0, f2, f4, r0) \
-           "dmtc1      "#r0", "#f0"                    \n\t" \
-           "pshufh     "#f0", "#f0", "#f4"             \n\t" \
-           "mov.d      "#f2", "#f0"                    \n\t"
+  "dmtc1      "#r0", "#f0"                    \n\t" \
+  "pshufh     "#f0", "#f0", "#f4"             \n\t" \
+  "mov.d      "#f2", "#f0"                    \n\t"
 
 //f4 should be 0x0
 #define MMI_Copy16Times(f0, f2, f4, r0) \
-           "dmtc1      "#r0", "#f0"                    \n\t" \
-           "punpcklbh  "#f0", "#f0", "#f0"             \n\t" \
-           "pshufh     "#f0", "#f0", "#f4"             \n\t" \
-           "mov.d      "#f2", "#f0"                    \n\t"
+  "dmtc1      "#r0", "#f0"                    \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f0"             \n\t" \
+  "pshufh     "#f0", "#f0", "#f4"             \n\t" \
+  "mov.d      "#f2", "#f0"                    \n\t"
 
 #define MMI_SumSubDiv2_SINGLE(f0, f2, f4, f6) \
-           "psrah      "#f4", "#f2", "#f6"             \n\t" \
-           "paddh      "#f4", "#f4", "#f0"             \n\t" \
-           "psrah      "#f0", "#f0", "#f6"             \n\t" \
-           "psubh      "#f0", "#f0", "#f2"             \n\t"
+  "psrah      "#f4", "#f2", "#f6"             \n\t" \
+  "paddh      "#f4", "#f4", "#f0"             \n\t" \
+  "psrah      "#f0", "#f0", "#f6"             \n\t" \
+  "psubh      "#f0", "#f0", "#f2"             \n\t"
 
 #define MMI_IDCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
-           MMI_SumSub_SINGLE(f6, f8, f10)             \
-           MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12)     \
-           MMI_SumSub_SINGLE(f0, f6, f10)             \
-           MMI_SumSub_SINGLE(f4, f8, f10)
+  MMI_SumSub_SINGLE(f6, f8, f10)             \
+  MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12)     \
+  MMI_SumSub_SINGLE(f0, f6, f10)             \
+  MMI_SumSub_SINGLE(f4, f8, f10)
 
 #define MMI_StoreDiff4P_SINGLE(f0, f2, f4, f6, r0, r1, f8) \
-           "gsldlc1    "#f2", 0x7("#r1")               \n\t" \
-           "gsldrc1    "#f2", 0x0("#r1")               \n\t" \
-           "punpcklbh  "#f2", "#f2", "#f6"             \n\t" \
-           "paddh      "#f0", "#f0", "#f4"             \n\t" \
-           "psrah      "#f0", "#f0", "#f8"             \n\t" \
-           "paddsh     "#f0", "#f0", "#f2"             \n\t" \
-           "packushb   "#f0", "#f0", "#f2"             \n\t" \
-           "gsswlc1    "#f0", 0x3("#r0")               \n\t" \
-           "gsswrc1    "#f0", 0x0("#r0")               \n\t"
+  "gsldlc1    "#f2", 0x7("#r1")               \n\t" \
+  "gsldrc1    "#f2", 0x0("#r1")               \n\t" \
+  "punpcklbh  "#f2", "#f2", "#f6"             \n\t" \
+  "paddh      "#f0", "#f0", "#f4"             \n\t" \
+  "psrah      "#f0", "#f0", "#f8"             \n\t" \
+  "paddsh     "#f0", "#f0", "#f2"             \n\t" \
+  "packushb   "#f0", "#f0", "#f2"             \n\t" \
+  "gsswlc1    "#f0", 0x3("#r0")               \n\t" \
+  "gsswrc1    "#f0", 0x0("#r0")               \n\t"
 
 #define SUMH_HORIZON(f0, f2, f4, f6, f8) \
-           "paddh      "#f0", "#f0", "#f2"                       \n\t" \
-           "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
-           "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
-           "paddw      "#f0", "#f0", "#f2"                       \n\t" \
-           "punpckhwd  "#f2", "#f0", "#f0"                       \n\t" \
-           "paddw      "#f0", "#f0", "#f2"                       \n\t"
+  "paddh      "#f0", "#f0", "#f2"                       \n\t" \
+  "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
+  "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
+  "paddw      "#f0", "#f0", "#f2"                       \n\t" \
+  "punpckhwd  "#f2", "#f0", "#f0"                       \n\t" \
+  "paddw      "#f0", "#f0", "#f2"                       \n\t"
 
 #define LOAD_COLUMN(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
-           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
-           "gsldlc1    "#f0", 0x7("#r0")                         \n\t" \
-           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
-           "gsldrc1    "#f0", 0x0("#r0")                         \n\t" \
-           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
-           "punpcklbh  "#f0", "#f0", "#f4"                       \n\t" \
-           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
-           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
-           "gsldlc1    "#f8", 0x7("#r0")                         \n\t" \
-           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
-           "gsldrc1    "#f8", 0x0("#r0")                         \n\t" \
-           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
-           "punpcklbh  "#f8", "#f8", "#f4"                       \n\t" \
-           "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
-           "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
-           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
-           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
-           "gsldlc1    "#f12", 0x7("#r0")                        \n\t" \
-           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
-           "gsldrc1    "#f12", 0x0("#r0")                        \n\t" \
-           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
-           "punpcklbh  "#f12", "#f12", "#f4"                     \n\t" \
-           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
-           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
-           "gsldlc1    "#f8", 0x7("#r0")                         \n\t" \
-           "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
-           "gsldrc1    "#f8", 0x0("#r0")                         \n\t" \
-           "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
-           "punpcklbh  "#f8", "#f8", "#f4"                       \n\t" \
-           "punpckhhw  "#f14", "#f12", "#f8"                     \n\t" \
-           "punpcklhw  "#f12", "#f12", "#f8"                     \n\t" \
-           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
-           "punpcklwd  "#f0", "#f2", "#f14"                      \n\t" \
-           "punpckhwd  "#f2", "#f2", "#f14"                      \n\t"
+  "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+  "gsldlc1    "#f0", 0x7("#r0")                         \n\t" \
+  "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+  "gsldrc1    "#f0", 0x0("#r0")                         \n\t" \
+  "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f4"                       \n\t" \
+  "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+  "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+  "gsldlc1    "#f8", 0x7("#r0")                         \n\t" \
+  "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+  "gsldrc1    "#f8", 0x0("#r0")                         \n\t" \
+  "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+  "punpcklbh  "#f8", "#f8", "#f4"                       \n\t" \
+  "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" \
+  "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" \
+  "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+  "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+  "gsldlc1    "#f12", 0x7("#r0")                        \n\t" \
+  "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+  "gsldrc1    "#f12", 0x0("#r0")                        \n\t" \
+  "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+  "punpcklbh  "#f12", "#f12", "#f4"                     \n\t" \
+  "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+  "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+  "gsldlc1    "#f8", 0x7("#r0")                         \n\t" \
+  "gsldlc1    "#f4", 0x7("#r2")                         \n\t" \
+  "gsldrc1    "#f8", 0x0("#r0")                         \n\t" \
+  "gsldrc1    "#f4", 0x0("#r2")                         \n\t" \
+  "punpcklbh  "#f8", "#f8", "#f4"                       \n\t" \
+  "punpckhhw  "#f14", "#f12", "#f8"                     \n\t" \
+  "punpcklhw  "#f12", "#f12", "#f8"                     \n\t" \
+  "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+  "punpcklwd  "#f0", "#f2", "#f14"                      \n\t" \
+  "punpckhwd  "#f2", "#f2", "#f14"                      \n\t"
 
 #define LOAD_COLUMN_C(f0, f2, f4, f6, r0, r1, r2) \
-           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
-           "gsldlc1    "#f0", 0x7("#r0")                         \n\t" \
-           "gsldlc1    "#f2", 0x7("#r2")                         \n\t" \
-           "gsldrc1    "#f0", 0x0("#r0")                         \n\t" \
-           "gsldrc1    "#f2", 0x0("#r2")                         \n\t" \
-           "punpcklbh  "#f0", "#f0", "#f2"                       \n\t" \
-           "daddu      "#r0", "#r2", "#r1"                       \n\t" \
-           "daddu      "#r2", "#r0", "#r1"                       \n\t" \
-           "gsldlc1    "#f4", 0x7("#r0")                         \n\t" \
-           "gsldlc1    "#f2", 0x7("#r2")                         \n\t" \
-           "gsldrc1    "#f4", 0x0("#r0")                         \n\t" \
-           "gsldrc1    "#f2", 0x0("#r2")                         \n\t" \
-           "punpcklbh  "#f4", "#f4", "#f2"                       \n\t" \
-           "punpckhhw  "#f0", "#f0", "#f4"                       \n\t" \
-           "daddu      "#r0", "#r2", "#r1"                       \n\t"
+  "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+  "gsldlc1    "#f0", 0x7("#r0")                         \n\t" \
+  "gsldlc1    "#f2", 0x7("#r2")                         \n\t" \
+  "gsldrc1    "#f0", 0x0("#r0")                         \n\t" \
+  "gsldrc1    "#f2", 0x0("#r2")                         \n\t" \
+  "punpcklbh  "#f0", "#f0", "#f2"                       \n\t" \
+  "daddu      "#r0", "#r2", "#r1"                       \n\t" \
+  "daddu      "#r2", "#r0", "#r1"                       \n\t" \
+  "gsldlc1    "#f4", 0x7("#r0")                         \n\t" \
+  "gsldlc1    "#f2", 0x7("#r2")                         \n\t" \
+  "gsldrc1    "#f4", 0x0("#r0")                         \n\t" \
+  "gsldrc1    "#f2", 0x0("#r2")                         \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f2"                       \n\t" \
+  "punpckhhw  "#f0", "#f0", "#f4"                       \n\t" \
+  "daddu      "#r0", "#r2", "#r1"                       \n\t"
+
 /**
  * backup register
  */
 #define BACKUP_REG \
-           double __back_temp[8];                                      \
-           if (_MIPS_SIM == _ABI64)                                    \
-           __asm__ volatile (                                          \
-             "gssqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
-             "gssqc1       $f27,      $f26,       0x10(%[temp])  \n\t" \
-             "gssqc1       $f29,      $f28,       0x20(%[temp])  \n\t" \
-             "gssqc1       $f31,      $f30,       0x30(%[temp])  \n\t" \
-             :                                                         \
-             : [temp]"r"(__back_temp)                                  \
-             : "memory"                                                \
-           );                                                          \
-          else                                                         \
-           __asm__ volatile (                                          \
-             "gssqc1       $f22,      $f20,       0x00(%[temp])  \n\t" \
-             "gssqc1       $f26,      $f24,       0x10(%[temp])  \n\t" \
-             "gssqc1       $f30,      $f28,       0x20(%[temp])  \n\t" \
-             :                                                         \
-             : [temp]"r"(__back_temp)                                  \
-             : "memory"                                                \
-           );
+   double __back_temp[8];                                      \
+   if (_MIPS_SIM == _ABI64)                                    \
+   __asm__ volatile (                                          \
+     "gssqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
+     "gssqc1       $f27,      $f26,       0x10(%[temp])  \n\t" \
+     "gssqc1       $f29,      $f28,       0x20(%[temp])  \n\t" \
+     "gssqc1       $f31,      $f30,       0x30(%[temp])  \n\t" \
+     :                                                         \
+     : [temp]"r"(__back_temp)                                  \
+     : "memory"                                                \
+   );                                                          \
+  else                                                         \
+   __asm__ volatile (                                          \
+     "gssqc1       $f22,      $f20,       0x00(%[temp])  \n\t" \
+     "gssqc1       $f26,      $f24,       0x10(%[temp])  \n\t" \
+     "gssqc1       $f30,      $f28,       0x20(%[temp])  \n\t" \
+     :                                                         \
+     : [temp]"r"(__back_temp)                                  \
+     : "memory"                                                \
+   );
 
 /**
  * recover register
  */
 #define RECOVER_REG \
-           if (_MIPS_SIM == _ABI64)                                    \
-           __asm__ volatile (                                          \
-             "gslqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
-             "gslqc1       $f27,      $f26,       0x10(%[temp])  \n\t" \
-             "gslqc1       $f29,      $f28,       0x20(%[temp])  \n\t" \
-             "gslqc1       $f31,      $f30,       0x30(%[temp])  \n\t" \
-             :                                                         \
-             : [temp]"r"(__back_temp)                                  \
-             : "memory"                                                \
-           );                                                          \
-           else                                                        \
-           __asm__ volatile (                                          \
-             "gslqc1       $f22,      $f20,       0x00(%[temp])  \n\t" \
-             "gslqc1       $f26,      $f24,       0x10(%[temp])  \n\t" \
-             "gslqc1       $f30,      $f28,       0x20(%[temp])  \n\t" \
-             :                                                         \
-             : [temp]"r"(__back_temp)                                  \
-             : "memory"                                                \
-           );
+   if (_MIPS_SIM == _ABI64)                                    \
+   __asm__ volatile (                                          \
+     "gslqc1       $f25,      $f24,       0x00(%[temp])  \n\t" \
+     "gslqc1       $f27,      $f26,       0x10(%[temp])  \n\t" \
+     "gslqc1       $f29,      $f28,       0x20(%[temp])  \n\t" \
+     "gslqc1       $f31,      $f30,       0x30(%[temp])  \n\t" \
+     :                                                         \
+     : [temp]"r"(__back_temp)                                  \
+     : "memory"                                                \
+   );                                                          \
+   else                                                        \
+   __asm__ volatile (                                          \
+     "gslqc1       $f22,      $f20,       0x00(%[temp])  \n\t" \
+     "gslqc1       $f26,      $f24,       0x10(%[temp])  \n\t" \
+     "gslqc1       $f30,      $f28,       0x20(%[temp])  \n\t" \
+     :                                                         \
+     : [temp]"r"(__back_temp)                                  \
+     : "memory"                                                \
+   );
 
 # define OK             1
 # define NOTOK          0
--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -75,6 +75,13 @@
 void WelsCopy8x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
 #endif
 
+#if defined (HAVE_MMI)
+void WelsCopy8x8_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x16_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x8NotAligned_mmi (uint8_t* Dst, int32_t  iStrideD, uint8_t* Src, int32_t  iStrideS);
+void WelsCopy16x16_mmi (uint8_t* Dst, int32_t  iStrideD, uint8_t* Src, int32_t  iStrideS);
+void WelsCopy16x16NotAligned_mmi (uint8_t* Dst, int32_t  iStrideD, uint8_t* Src, int32_t  iStrideS);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/common/inc/expand_pic.h
+++ b/codec/common/inc/expand_pic.h
@@ -73,6 +73,15 @@
                                        const int32_t kiPicH);
 #endif
 
+#if defined(HAVE_MMI)
+void ExpandPictureLuma_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+                            const int32_t kiPicH);
+void ExpandPictureChromaAlign_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+                                   const int32_t kiPicH);
+void ExpandPictureChromaUnalign_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+                                     const int32_t kiPicH);
+#endif//HAVE_MMI
+
 typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
 
 typedef struct TagExpandPicFunc {
--- a/codec/common/inc/intra_pred_common.h
+++ b/codec/common/inc/intra_pred_common.h
@@ -67,6 +67,11 @@
 void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 #endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI) /* MMI variants with the same signatures as the AArch64 NEON predictors declared just above */
+void WelsI16x16LumaPredV_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* NOTE(review): presumably vertical 16x16 luma prediction - definition not visible here */
+void WelsI16x16LumaPredH_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* NOTE(review): presumably horizontal 16x16 luma prediction - definition not visible here */
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -104,6 +104,19 @@
 void WelsSampleSadFour8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
 void WelsSampleSadFour4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
 #endif
+
+#if defined (HAVE_MMI) /* MMI SAD entry points; NOTE(review): definitions are not visible in this part of the patch */
+int32_t WelsSampleSad4x4_mmi (uint8_t*, int32_t, uint8_t*, int32_t); /* NOTE(review): arguments presumably (sample, stride, reference, stride) - confirm against the definition */
+int32_t WelsSampleSad16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*); /* same parameter list as the AArch64 NEON SadFour declarations above; results go through the int32_t* argument (presumably) */
+void WelsSampleSadFour16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- /dev/null
+++ b/codec/common/mips/copy_mb_mmi.c
@@ -1,0 +1,477 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    copy_mb_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+void WelsCopy8x8_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, /* Copies 8 rows of 8 bytes from pSrc (row pitch iStrideS) to pDst (row pitch iStrideD). */
+                     int32_t  iStrideS ) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t" /* assembler directive: accept Loongson-3A instructions from here on */
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t" /* $8 = address of source row 1; rows are handled two at a time */
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t" /* gsldlc1(+7)/gsldrc1(+0) pair fills one FP register with bytes 0..7 of a row */
+    "gsldlc1    $f2, 0x7($8)                    \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f2, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t" /* pSrc -> row 2 */
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t" /* $8   -> row 3 */
+    "gsldlc1    $f4, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f6, 0x7($8)                    \n\t"
+    "gsldrc1    $f4, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t" /* pSrc -> row 4 */
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t" /* $8   -> row 5 */
+    "gsldlc1    $f8, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f10, 0x7($8)                   \n\t"
+    "gsldrc1    $f8, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t" /* pSrc -> row 6 */
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t" /* $8   -> row 7 */
+    "gsldlc1    $f12, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f14, 0x7($8)                   \n\t"
+    "gsldrc1    $f12, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f14, 0x0($8)                   \n\t" /* all 8 rows are now held in $f0,$f2,...,$f14 */
+
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t" /* store phase mirrors the load phase, two destination rows at a time */
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t" /* gssdlc1(+7)/gssdrc1(+0) pair writes one register to bytes 0..7 of a row */
+    "gssdlc1    $f2, 0x7($8)                    \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f2, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f4, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f6, 0x7($8)                    \n\t"
+    "gssdrc1    $f4, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f6, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f8, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f10, 0x7($8)                   \n\t"
+    "gssdrc1    $f8, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f10, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f12, 0x7(%[pDst])              \n\t"
+    "gssdlc1    $f14, 0x7($8)                   \n\t"
+    "gssdrc1    $f12, 0x0(%[pDst])              \n\t"
+    "gssdrc1    $f14, 0x0($8)                   \n\t"
+   : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are read-write operands: the asm advances them */
+   : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14" /* scratch GPR $8 plus the eight FP registers used above */
+  );
+}
+
+void WelsCopy8x16_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, /* Copies 16 rows of 8 bytes from pSrc (pitch iStrideS) to pDst (pitch iStrideD), as two passes of 8 rows. */
+                      int32_t iStrideS) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t" /* assembler directive: accept Loongson-3A instructions from here on */
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t" /* pass 1 load: $8 = source row 1; rows are handled two at a time */
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t" /* gsldlc1(+7)/gsldrc1(+0) pair fills one FP register with bytes 0..7 of a row */
+    "gsldlc1    $f2, 0x7($8)                    \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f2, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
+    "gsldlc1    $f4, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f6, 0x7($8)                    \n\t"
+    "gsldrc1    $f4, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
+    "gsldlc1    $f8, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f10, 0x7($8)                   \n\t"
+    "gsldrc1    $f8, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
+    "gsldlc1    $f12, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f14, 0x7($8)                   \n\t"
+    "gsldrc1    $f12, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f14, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t" /* pSrc -> row 8, ready for pass 2 */
+
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t" /* pass 1 store: rows 0..7 from $f0..$f14 */
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t" /* gssdlc1(+7)/gssdrc1(+0) pair writes one register to bytes 0..7 of a row */
+    "gssdlc1    $f2, 0x7($8)                    \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f2, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f4, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f6, 0x7($8)                    \n\t"
+    "gssdrc1    $f4, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f6, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f8, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f10, 0x7($8)                   \n\t"
+    "gssdrc1    $f8, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f10, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f12, 0x7(%[pDst])              \n\t"
+    "gssdlc1    $f14, 0x7($8)                   \n\t"
+    "gssdrc1    $f12, 0x0(%[pDst])              \n\t"
+    "gssdrc1    $f14, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t" /* pDst -> row 8, ready for pass 2 */
+
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t" /* pass 2 load: rows 8..15, reusing $f0..$f14 */
+    "gsldlc1    $f0, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f2, 0x7($8)                    \n\t"
+    "gsldrc1    $f0, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f2, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
+    "gsldlc1    $f4, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f6, 0x7($8)                    \n\t"
+    "gsldrc1    $f4, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
+    "gsldlc1    $f8, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f10, 0x7($8)                   \n\t"
+    "gsldrc1    $f8, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pSrc], $8, %[iStrideS]        \n\t"
+    PTR_ADDU   "$8, %[pSrc], %[iStrideS]        \n\t"
+    "gsldlc1    $f12, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f14, 0x7($8)                   \n\t"
+    "gsldrc1    $f12, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f14, 0x0($8)                   \n\t" /* last source row read; pSrc is not advanced further */
+
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t" /* pass 2 store: rows 8..15 */
+    "gssdlc1    $f0, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f2, 0x7($8)                    \n\t"
+    "gssdrc1    $f0, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f2, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f4, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f6, 0x7($8)                    \n\t"
+    "gssdrc1    $f4, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f6, 0x0($8)                    \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f8, 0x7(%[pDst])               \n\t"
+    "gssdlc1    $f10, 0x7($8)                   \n\t"
+    "gssdrc1    $f8, 0x0(%[pDst])               \n\t"
+    "gssdrc1    $f10, 0x0($8)                   \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iStrideD]        \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iStrideD]        \n\t"
+    "gssdlc1    $f12, 0x7(%[pDst])              \n\t"
+    "gssdlc1    $f14, 0x7($8)                   \n\t"
+    "gssdrc1    $f12, 0x0(%[pDst])              \n\t"
+    "gssdrc1    $f14, 0x0($8)                   \n\t"
+   : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are read-write operands: the asm advances them */
+   : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14" /* scratch GPR $8 plus the eight FP registers used above */
+  );
+}
+
+void WelsCopy16x16_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc, /* Copies 16 rows of 16 bytes from pSrc (pitch iSrcStride) to pDst (pitch iDstStride), 8 rows per pass. */
+                       int32_t iSrcStride) {
+  BACKUP_REG; /* macro from asmdefs_mmi.h; NOTE(review): presumably saves the FP registers that RECOVER_REG reloads - confirm */
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t" /* assembler directive: accept Loongson-3A instructions from here on */
+    "gslqc1     $f0, $f2, 0x0(%[pSrc])          \n\t" /* one gslqc1 loads a whole 16-byte row into an FP register pair; NOTE(review): presumably needs a 16-byte-aligned address (cf. the NotAligned variants) - confirm */
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f4, $f6, 0x0(%[pSrc])          \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f8, $f10, 0x0(%[pSrc])         \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f12, $f14, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f16, $f18, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f20, $f22, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f24, $f26, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f28, $f30, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t" /* pSrc -> row 8 */
+
+    "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t" /* write rows 0..7 with the matching 16-byte quad stores */
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f4, $f6, 0x0(%[pDst])          \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f12, $f14, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f20, $f22, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f28, $f30, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t" /* pDst -> row 8 */
+
+    "gslqc1     $f0, $f2, 0x0(%[pSrc])          \n\t" /* second pass: load rows 8..15 into the same register pairs */
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f4, $f6, 0x0(%[pSrc])          \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f8, $f10, 0x0(%[pSrc])         \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f12, $f14, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f16, $f18, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f20, $f22, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f24, $f26, 0x0(%[pSrc])        \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gslqc1     $f28, $f30, 0x0(%[pSrc])        \n\t" /* last source row: pSrc is not advanced afterwards */
+
+    "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t" /* write rows 8..15 */
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f4, $f6, 0x0(%[pDst])          \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f12, $f14, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f20, $f22, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f28, $f30, 0x0(%[pDst])        \n\t" /* last destination row: pDst is not advanced afterwards */
+   : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are read-write operands: the asm advances them */
+   : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* all sixteen even-numbered FP registers are overwritten; no scratch GPR is used */
+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+void WelsCopy16x16NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc, /* Copies 16 rows of 16 bytes; the source is read with gsldlc1/gsldrc1 pairs, the destination is written with gssqc1. */
+                                 int32_t iSrcStride) {
+  BACKUP_REG; /* macro from asmdefs_mmi.h; NOTE(review): presumably saves the FP registers that RECOVER_REG reloads - confirm */
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t" /* assembler directive: accept Loongson-3A instructions from here on */
+    "gsldlc1    $f2, 0x7(%[pSrc])               \n\t" /* row 0: bytes 0..7 go to $f2 (offsets 0x7/0x0) */
+    "gsldlc1    $f0, 0xF(%[pSrc])               \n\t" /* row 0: bytes 8..15 go to $f0 (offsets 0xF/0x8) */
+    "gsldrc1    $f2, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f6, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f10, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f8, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f8, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f14, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f12, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f14, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f12, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f18, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f18, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f22, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f22, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f26, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f24, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f26, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f24, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f30, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f28, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f30, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f28, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t" /* pSrc -> row 8 */
+
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t" /* store rows 0..7 two at a time; $8 = odd destination row */
+    "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t" /* quad store of one 16-byte row; NOTE(review): presumably needs a 16-byte-aligned pDst - confirm */
+    "gssqc1     $f4, $f6, 0x0($8)               \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
+    "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
+    "gssqc1     $f12, $f14, 0x0($8)             \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
+    "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
+    "gssqc1     $f20, $f22, 0x0($8)             \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
+    "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
+    "gssqc1     $f28, $f30, 0x0($8)             \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t" /* pDst -> row 8 */
+
+    "gsldlc1    $f2, 0x7(%[pSrc])               \n\t" /* second pass: load rows 8..15 with the same register assignment */
+    "gsldlc1    $f0, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f2, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f6, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f10, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f8, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f8, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f14, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f12, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f14, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f12, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f18, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f18, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f22, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f22, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f26, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f24, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f26, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f24, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f30, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f28, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f30, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f28, 0x8(%[pSrc])              \n\t" /* last source row: pSrc is not advanced afterwards */
+
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t" /* store rows 8..15 */
+    "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t"
+    "gssqc1     $f4, $f6, 0x0($8)               \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
+    "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
+    "gssqc1     $f12, $f14, 0x0($8)             \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
+    "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
+    "gssqc1     $f20, $f22, 0x0($8)             \n\t"
+    PTR_ADDU   "%[pDst], $8, %[iDstStride]      \n\t"
+    PTR_ADDU   "$8, %[pDst], %[iDstStride]      \n\t"
+    "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
+    "gssqc1     $f28, $f30, 0x0($8)             \n\t"
+   : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are read-write operands: the asm advances them */
+   : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* scratch GPR $8 plus all sixteen even-numbered FP registers */
+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
+
+void WelsCopy16x8NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc, /* Copies 8 rows of 16 bytes in a single pass; source read with gsldlc1/gsldrc1 pairs, destination written with gssqc1. */
+                                int32_t iSrcStride) {
+  BACKUP_REG; /* macro from asmdefs_mmi.h; NOTE(review): presumably saves the FP registers that RECOVER_REG reloads - confirm */
+  __asm__ volatile (
+    ".set       arch=loongson3a                 \n\t" /* assembler directive: accept Loongson-3A instructions from here on */
+    "gsldlc1    $f2, 0x7(%[pSrc])               \n\t" /* row 0: bytes 0..7 go to $f2 (offsets 0x7/0x0) */
+    "gsldlc1    $f0, 0xF(%[pSrc])               \n\t" /* row 0: bytes 8..15 go to $f0 (offsets 0xF/0x8) */
+    "gsldrc1    $f2, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f0, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f6, 0x7(%[pSrc])               \n\t"
+    "gsldlc1    $f4, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f6, 0x0(%[pSrc])               \n\t"
+    "gsldrc1    $f4, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f10, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f8, 0xF(%[pSrc])               \n\t"
+    "gsldrc1    $f10, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f8, 0x8(%[pSrc])               \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f14, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f12, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f14, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f12, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f18, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f16, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f18, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f16, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f22, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f20, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f22, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f20, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f26, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f24, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f26, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f24, 0x8(%[pSrc])              \n\t"
+    PTR_ADDU   "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+    "gsldlc1    $f30, 0x7(%[pSrc])              \n\t"
+    "gsldlc1    $f28, 0xF(%[pSrc])              \n\t"
+    "gsldrc1    $f30, 0x0(%[pSrc])              \n\t"
+    "gsldrc1    $f28, 0x8(%[pSrc])              \n\t" /* last source row: pSrc is not advanced afterwards */
+
+    "gssqc1     $f0, $f2, 0x0(%[pDst])          \n\t" /* quad store of one 16-byte row; NOTE(review): presumably needs a 16-byte-aligned pDst - confirm */
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f4, $f6, 0x0(%[pDst])          \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f8, $f10, 0x0(%[pDst])         \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f12, $f14, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f16, $f18, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f20, $f22, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f24, $f26, 0x0(%[pDst])        \n\t"
+    PTR_ADDU   "%[pDst], %[pDst], %[iDstStride] \n\t"
+    "gssqc1     $f28, $f30, 0x0(%[pDst])        \n\t" /* last destination row: pDst is not advanced afterwards */
+   : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are read-write operands: the asm advances them */
+   : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+   : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* all sixteen even-numbered FP registers are overwritten; no scratch GPR is used */
+     "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above */
+}
--- /dev/null
+++ b/codec/common/mips/expand_picture_mmi.c
@@ -1,0 +1,673 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    expand_picture_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    24/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+#define mov_line_8x4_mmi_aligned(r0, r1, f0) /* asm-template fragment: store FP register f0 at address r0 four times, adding r1 to r0 after every store */ \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" /* indexed store with $0 as the index register, offset 0 */ \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" /* r0 += r1 (next line) */ \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t"
+
+#define mov_line_8x4_mmi_unaligned(r0, r1, f0) /* asm-template fragment: same four-line store as the aligned variant, using gssdlc1/gssdrc1 pairs */ \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* gssdlc1(+7)/gssdrc1(+0) together write f0 to bytes 0..7 at r0 */ \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" /* r0 += r1 (next line) */ \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t"
+
+#define mov_line_end8x4_mmi_aligned(r0, r1, f0) /* asm-template fragment: store FP register f0 at r0 four times, adding r1 to r0 after every store */ \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdxc1    "#f0", 0x0("#r0", $0)           \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" /* NOTE(review): body matches mov_line_8x4_mmi_aligned; the other "end" macros omit this final r0 += r1 - confirm intent */
+
+#define mov_line_end8x4_mmi_unaligned(r0, r1, f0) /* asm-template fragment: four gssdlc1/gssdrc1 line stores of f0; r0 is advanced by r1 between lines but not after the last one */ \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* gssdlc1(+7)/gssdrc1(+0) together write f0 to bytes 0..7 at r0 */ \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" /* NOTE(review): the trailing backslash continues the macro onto the blank line below; keep that line empty */ \
+
+#define mov_line_16x4_mmi_aligned(r0, r1, f0, f2) /* asm-template fragment: store the f2/f0 register pair at r0 with gssqc1 four times, adding r1 to r0 after every store */ \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" /* one quad store per line; NOTE(review): presumably f0 = bytes 0..7 and f2 = bytes 8..15, as in the unaligned variant - confirm */ \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" /* r0 += r1 (next line) */ \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t"
+
+#define mov_line_16x4_mmi_unaligned(r0, r1, f0, f2) /* asm-template fragment: write f0 to bytes 0..7 and f2 to bytes 8..15 at r0 for four lines, adding r1 to r0 after every line */ \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* f0 -> offsets 0x0..0x7 via the gssdlc1/gssdrc1 pair */ \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" /* f2 -> offsets 0x8..0xF via the gssdlc1/gssdrc1 pair */ \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" /* r0 += r1 (next line) */ \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t"
+
+#define mov_line_end16x4_mmi_aligned(r0, r1, f0, f2) /* asm-template fragment: four gssqc1 line stores of the f2/f0 pair; r0 is advanced by r1 between lines but not after the last one */ \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")        \n\t" /* final line: no pointer bump follows */
+
+#define mov_line_end16x4_mmi_unaligned(r0, r1, f0, f2) /* asm text: store f0 (bytes 0..7) and f2 (bytes 8..15) to 4 rows at r0 with left/right partial stores, adding r1 between rows but NOT after the fourth */ \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* row 1: gssdlc1/gssdrc1 pair writes f0 to bytes 0..7 */ \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" /* second pair writes f2 to bytes 8..15 */ \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* row 2 */ \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* row 3 */ \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"             \n\t" \
+  "gssdlc1    "#f0", 0x7("#r0")               \n\t" /* row 4: r0 is left pointing at this row */ \
+  "gssdlc1    "#f2", 0xF("#r0")               \n\t" \
+  "gssdrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gssdrc1    "#f2", 0x8("#r0")               \n\t" /* macro ends here: no line continuation after the final instruction */
+
+#define exp_top_bottom_mmi_32 /* asm text: per 16-byte column, copy the 16 bytes at pDst into 32 rows via $9 and the 16 bytes at iHeight into 32 rows via $11; the caller sets up $9, $11 and iStride */ \
+  "dsra       %[iWidth], %[iWidth], 0x4              \n\t" /* iWidth becomes the number of 16-byte columns; the loop body runs before the test, so a count of 0 is not handled */ \
+  "1:                                                \n\t" \
+  "gslqc1     $f2, $f0, 0x0(%[pDst])                 \n\t" /* 16 source bytes at pDst */ \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      /* 7 x 4 rows + 4 rows = 32 rows through $9 */ \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)      \
+  mov_line_end16x4_mmi_aligned($9, %[iStride], $f0, $f2)   /* $9 stays on the 32nd row */ \
+  "gslqc1     $f6, $f4, 0x0(%[iHeight])              \n\t" /* 16 source bytes at iHeight (used as a pointer here) */ \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     /* 32 rows through $11 */ \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)     \
+  mov_line_end16x4_mmi_aligned($11, %[iStride], $f4, $f6)  /* $11 stays on the 32nd row */ \
+  PTR_ADDIU  "%[pDst], %[pDst], 0x10                 \n\t" /* step all four pointers to the next 16-byte column */ \
+  PTR_ADDIU  "$9, $9, 0x10                           \n\t" \
+  PTR_ADDIU  "%[iHeight], %[iHeight], 0x10           \n\t" \
+  PTR_ADDIU  "$11, $11, 0x10                         \n\t" \
+  "dnegu      %[iStride], %[iStride]                 \n\t" /* reverse direction: $9/$11 sit on the last row written, so the next column walks back over the same 32 rows */ \
+  PTR_ADDIU  "%[iWidth], %[iWidth], -0x1             \n\t" \
+  "bnez       %[iWidth], 1b                          \n\t" \
+  "nop                                               \n\t" /* branch delay slot */
+
+#define exp_left_right_mmi_32 /* asm text: for $8 rows, write 32 bytes at $9 from the byte at pDst and 32 bytes at $11 from the byte at iHeight, stepping all pointers by iStride */ \
+  "2:                                             \n\t" \
+  "lbu        %[iWidth], 0x0(%[pDst])             \n\t" /* iWidth is reused as a scratch register for the source byte */ \
+  MMI_Copy16Times($f0, $f2, $f28, %[iWidth])            /* defined outside this view; presumably replicates the byte across $f0/$f2 */ \
+  "gssqc1     $f2, $f0, 0x0($9)                   \n\t" /* 2 x 16 bytes at $9 */ \
+  "gssqc1     $f2, $f0, 0x10($9)                  \n\t" \
+  "lbu        %[iWidth], 0x0(%[iHeight])          \n\t" /* source byte at iHeight (used as a pointer here) */ \
+  MMI_Copy16Times($f4, $f6, $f28, %[iWidth])            \
+  "gssqc1     $f6, $f4, 0x0($11)                  \n\t" /* 2 x 16 bytes at $11 */ \
+  "gssqc1     $f6, $f4, 0x10($11)                 \n\t" \
+  PTR_ADDU   "%[pDst], %[pDst], %[iStride]        \n\t" /* advance sources and destinations by one row */ \
+  PTR_ADDU   "$9, $9, %[iStride]                  \n\t" \
+  PTR_ADDU   "%[iHeight], %[iHeight], %[iStride]  \n\t" \
+  PTR_ADDU   "$11, $11, %[iStride]                \n\t" \
+  PTR_ADDIU  "$8, $8, -0x1                        \n\t" /* $8 = rows remaining */ \
+  "bnez       $8, 2b                              \n\t" \
+  "nop                                            \n\t" /* branch delay slot */
+
+#define mov_line_32x4_mmi(r0, r1, f0, f2) /* asm text: store the 16-byte pair f0/f2 twice (offsets 0x0 and 0x10) on 4 rows at r0, adding r1 to r0 after every row */ \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 1: bytes 0..15 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" /* bytes 16..31 */ \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 2 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 3 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 4 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" /* r0 ends one step past the last row written */
+
+#define mov_line_end32x4_mmi(r0, r1, f0, f2) /* asm text: store the 16-byte pair f0/f2 twice (offsets 0x0 and 0x10) on 4 rows at r0, adding r1 between rows but NOT after the fourth */ \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 1: bytes 0..15 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" /* bytes 16..31 */ \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 2 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 3 */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t" \
+  PTR_ADDU   ""#r0", "#r0", "#r1"              \n\t" \
+  "gssqc1     "#f2", "#f0", 0x0("#r0")         \n\t" /* row 4: r0 is left pointing at this row */ \
+  "gssqc1     "#f2", "#f0", 0x10("#r0")        \n\t"
+
+#define  exp_cross_mmi_32 /* asm text: fill four 32-byte x 32-row blocks, one per pointer (iHeight, $11, $9, $8), each from its own register pair; the caller sets up pointers, registers and iStride */ \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        /* block 1: 7 x 4 + 4 = 32 rows at iHeight from $f12/$f14 */ \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_end32x4_mmi(%[iHeight], %[iStride], $f12, $f14)     \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               /* block 2: 32 rows at $11 from $f16/$f18 */ \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               \
+  mov_line_32x4_mmi($11, %[iStride], $f16, $f18)               \
+  mov_line_end32x4_mmi($11, %[iStride], $f16, $f18)            \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                /* block 3: 32 rows at $9 from $f20/$f22 */ \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                \
+  mov_line_32x4_mmi($9, %[iStride], $f20, $f22)                \
+  mov_line_end32x4_mmi($9, %[iStride], $f20, $f22)             \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                /* block 4: 32 rows at $8 from $f24/$f26 */ \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                \
+  mov_line_32x4_mmi($8, %[iStride], $f24, $f26)                \
+  mov_line_end32x4_mmi($8, %[iStride], $f24, $f26)
+
+#define exp_top_bottom_mmi_16_aligned /* asm text: per 16-byte column, copy the 16 bytes at pDst into 16 rows via $9 and the 16 bytes at iHeight into 16 rows via $11, then handle one extra 8-byte column if iWidth is not a multiple of 16 */ \
+  "move       $8, %[iWidth]                              \n\t" /* keep the original width for the remainder test below */ \
+  "dsra       %[iWidth], %[iWidth], 0x4                  \n\t" /* iWidth becomes the number of 16-byte columns; the loop body runs before the test, so a count of 0 is not handled */ \
+  "1:                                                    \n\t" \
+  "gslqc1     $f2, $f0, 0x0(%[pDst])                     \n\t" /* 16 source bytes at pDst */ \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)          /* 3 x 4 rows + 4 rows = 16 rows through $9 */ \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)          \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2)          \
+  mov_line_end16x4_mmi_aligned($9, %[iStride], $f0, $f2)       \
+  "gslqc1     $f6, $f4, 0x0(%[iHeight])                  \n\t" /* 16 source bytes at iHeight (used as a pointer here) */ \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)         /* 16 rows through $11 */ \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)         \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6)         \
+  mov_line_end16x4_mmi_aligned($11, %[iStride], $f4, $f6)      \
+  PTR_ADDIU  "%[pDst], %[pDst], 0x10                     \n\t" /* step all four pointers to the next 16-byte column */ \
+  PTR_ADDIU  "$9, $9, 0x10                               \n\t" \
+  PTR_ADDIU  "%[iHeight], %[iHeight], 0x10               \n\t" \
+  PTR_ADDIU  "$11, $11, 0x10                             \n\t" \
+  "dnegu      %[iStride], %[iStride]                     \n\t" /* reverse direction: the next column walks back over the same 16 rows */ \
+  PTR_ADDIU  "%[iWidth], %[iWidth], -0x1                 \n\t" \
+  "bnez       %[iWidth], 1b                              \n\t" \
+  "nop                                                   \n\t" /* branch delay slot */ \
+  "and        $8, 0x0F                                   \n\t" /* width modulo 16 */ \
+  "beqz       $8, 2f                                     \n\t" /* no remainder: skip the tail */ \
+  "nop                                                   \n\t" \
+  "gsldxc1    $f0, 0x0(%[pDst], $0)                      \n\t" /* tail: 8 source bytes at pDst; only 8 bytes are copied, so presumably the remainder is always 8 */ \
+  mov_line_8x4_mmi_aligned($9, %[iStride], $f0)                /* 8-byte row helpers are defined outside this view */ \
+  mov_line_8x4_mmi_aligned($9, %[iStride], $f0)                \
+  mov_line_8x4_mmi_aligned($9, %[iStride], $f0)                \
+  mov_line_end8x4_mmi_aligned($9, %[iStride], $f0)             \
+  "gsldxc1    $f4, 0x0(%[iHeight], $0)                   \n\t" /* tail: 8 source bytes at iHeight */ \
+  mov_line_8x4_mmi_aligned($11, %[iStride], $f4)               \
+  mov_line_8x4_mmi_aligned($11, %[iStride], $f4)               \
+  mov_line_8x4_mmi_aligned($11, %[iStride], $f4)               \
+  mov_line_end8x4_mmi_aligned($11, %[iStride], $f4)            \
+  "2:                                                    \n\t"
+
+#define exp_top_bottom_mmi_16_unaligned /* asm text: per 16-byte column, copy the 16 bytes at pDst into 16 rows via $9 and the 16 bytes at iHeight into 16 rows via $11 using left/right partial loads and stores, then handle one extra 8-byte column if iWidth is not a multiple of 16 */ \
+  "move       $8, %[iWidth]                              \n\t" /* keep the original width for the remainder test below */ \
+  "dsra       %[iWidth], %[iWidth], 0x4                  \n\t" /* iWidth becomes the number of 16-byte columns; the loop body runs before the test, so a count of 0 is not handled */ \
+  "1:                                                    \n\t" \
+  "gsldlc1    $f0, 0x7(%[pDst])                          \n\t" /* $f0 = bytes 0..7 at pDst, $f2 = bytes 8..15 */ \
+  "gsldlc1    $f2, 0xF(%[pDst])                          \n\t" \
+  "gsldrc1    $f0, 0x0(%[pDst])                          \n\t" \
+  "gsldrc1    $f2, 0x8(%[pDst])                          \n\t" \
+  mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2)        /* 3 x 4 rows + 4 rows = 16 rows through $9 */ \
+  mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2)        \
+  mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2)        \
+  mov_line_end16x4_mmi_unaligned($9, %[iStride], $f0, $f2)     \
+  "gsldlc1    $f4, 0x7(%[iHeight])                       \n\t" /* $f4/$f6 = 16 bytes at iHeight (used as a pointer here) */ \
+  "gsldlc1    $f6, 0xF(%[iHeight])                       \n\t" \
+  "gsldrc1    $f4, 0x0(%[iHeight])                       \n\t" \
+  "gsldrc1    $f6, 0x8(%[iHeight])                       \n\t" \
+  mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6)       /* 16 rows through $11 */ \
+  mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6)       \
+  mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6)       \
+  mov_line_end16x4_mmi_unaligned($11, %[iStride], $f4, $f6)    \
+  PTR_ADDIU  "%[pDst], %[pDst], 0x10                     \n\t" /* step all four pointers to the next 16-byte column */ \
+  PTR_ADDIU  "$9, $9, 0x10                               \n\t" \
+  PTR_ADDIU  "%[iHeight], %[iHeight], 0x10               \n\t" \
+  PTR_ADDIU  "$11, $11, 0x10                             \n\t" \
+  "dnegu      %[iStride], %[iStride]                     \n\t" /* reverse direction: the next column walks back over the same 16 rows */ \
+  PTR_ADDIU  "%[iWidth], %[iWidth], -0x1                 \n\t" \
+  "bnez       %[iWidth], 1b                              \n\t" \
+  "nop                                                   \n\t" /* branch delay slot */ \
+  "and        $8, 0x0F                                   \n\t" /* width modulo 16 */ \
+  "beqz       $8, 2f                                     \n\t" /* no remainder: skip the tail */ \
+  "nop                                                   \n\t" \
+  "gsldlc1    $f0, 0x7(%[pDst])                          \n\t" /* tail: 8 source bytes at pDst; only 8 bytes are copied, so presumably the remainder is always 8 */ \
+  "gsldrc1    $f0, 0x0(%[pDst])                          \n\t" \
+  mov_line_8x4_mmi_unaligned($9, %[iStride], $f0)              /* 8-byte row helpers are defined outside this view */ \
+  mov_line_8x4_mmi_unaligned($9, %[iStride], $f0)              \
+  mov_line_8x4_mmi_unaligned($9, %[iStride], $f0)              \
+  mov_line_end8x4_mmi_unaligned($9, %[iStride], $f0)           \
+  "gsldlc1    $f4, 0x7(%[iHeight])                       \n\t" /* tail: 8 source bytes at iHeight */ \
+  "gsldrc1    $f4, 0x0(%[iHeight])                       \n\t" \
+  mov_line_8x4_mmi_unaligned($11, %[iStride], $f4)             \
+  mov_line_8x4_mmi_unaligned($11, %[iStride], $f4)             \
+  mov_line_8x4_mmi_unaligned($11, %[iStride], $f4)             \
+  mov_line_end8x4_mmi_unaligned($11, %[iStride], $f4)          \
+  "2:                                                    \n\t"
+
+#define exp_left_right_mmi_16_aligned /* asm text: for $8 rows, write 16 bytes at $9 from the byte at pDst and 16 bytes at $11 from the byte at iHeight, stepping all pointers by iStride */ \
+  "3:                                             \n\t" \
+  "lbu        %[iWidth], 0x0(%[pDst])             \n\t" /* iWidth is reused as a scratch register for the source byte */ \
+  MMI_Copy16Times($f0, $f2, $f28, %[iWidth])            /* defined outside this view; presumably replicates the byte across $f0/$f2 */ \
+  "gssqc1     $f2, $f0, 0x0($9)                   \n\t" /* 16 bytes at $9 */ \
+  "lbu        %[iWidth], 0x0(%[iHeight])          \n\t" /* source byte at iHeight (used as a pointer here) */ \
+  MMI_Copy16Times($f4, $f6, $f28, %[iWidth])            \
+  "gssqc1     $f6, $f4, 0x0($11)                  \n\t" /* 16 bytes at $11 */ \
+  PTR_ADDU   "%[pDst], %[pDst], %[iStride]        \n\t" /* advance sources and destinations by one row */ \
+  PTR_ADDU   "$9, $9, %[iStride]                  \n\t" \
+  PTR_ADDU   "%[iHeight], %[iHeight], %[iStride]  \n\t" \
+  PTR_ADDU   "$11, $11, %[iStride]                \n\t" \
+  PTR_ADDIU  "$8, $8, -0x1                        \n\t" /* $8 = rows remaining */ \
+  "bnez       $8, 3b                              \n\t" \
+  "nop                                            \n\t" /* branch delay slot */
+
+#define exp_left_right_mmi_16_unaligned /* asm text: for $8 rows, write 16 bytes at $9 from the byte at pDst and 16 bytes at $11 from the byte at iHeight with left/right partial stores, stepping all pointers by iStride */ \
+  "3:                                             \n\t" \
+  "lbu        %[iWidth], 0x0(%[pDst])             \n\t" /* iWidth is reused as a scratch register for the source byte */ \
+  MMI_Copy16Times($f0, $f2, $f28, %[iWidth])            /* defined outside this view; presumably replicates the byte across $f0/$f2 */ \
+  "gssdlc1    $f0, 0x7($9)                        \n\t" /* $f0 -> bytes 0..7 at $9, $f2 -> bytes 8..15 */ \
+  "gssdlc1    $f2, 0xF($9)                        \n\t" \
+  "gssdrc1    $f0, 0x0($9)                        \n\t" \
+  "gssdrc1    $f2, 0x8($9)                        \n\t" \
+  "lbu        %[iWidth], 0x0(%[iHeight])          \n\t" /* source byte at iHeight (used as a pointer here) */ \
+  MMI_Copy16Times($f4, $f6, $f28, %[iWidth])            \
+  "gssdlc1    $f4, 0x7($11)                       \n\t" /* $f4 -> bytes 0..7 at $11, $f6 -> bytes 8..15 */ \
+  "gssdlc1    $f6, 0xF($11)                       \n\t" \
+  "gssdrc1    $f4, 0x0($11)                       \n\t" \
+  "gssdrc1    $f6, 0x8($11)                       \n\t" \
+  PTR_ADDU   "%[pDst], %[pDst], %[iStride]        \n\t" /* advance sources and destinations by one row */ \
+  PTR_ADDU   "$9, $9, %[iStride]                  \n\t" \
+  PTR_ADDU   "%[iHeight], %[iHeight], %[iStride]  \n\t" \
+  PTR_ADDU   "$11, $11, %[iStride]                \n\t" \
+  PTR_ADDIU  "$8, $8, -0x1                        \n\t" /* $8 = rows remaining */ \
+  "bnez       $8, 3b                              \n\t" \
+  "nop                                            \n\t" /* branch delay slot */
+
+#define exp_cross_mmi_16_aligned /* asm text: fill four 16-byte x 16-row blocks, one per pointer (iHeight, $11, $9, $8), each from its own register pair, using 128-bit stores */ \
+  mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14)        /* block 1: 3 x 4 + 4 = 16 rows at iHeight from $f12/$f14 */ \
+  mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14)        \
+  mov_line_end16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14)     \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18)               /* block 2: 16 rows at $11 from $f16/$f18 */ \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18)               \
+  mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18)               \
+  mov_line_end16x4_mmi_aligned($11, %[iStride], $f16, $f18)            \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22)                /* block 3: 16 rows at $9 from $f20/$f22 */ \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22)                \
+  mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22)                \
+  mov_line_end16x4_mmi_aligned($9, %[iStride], $f20, $f22)             \
+  mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26)                /* block 4: 16 rows at $8 from $f24/$f26 */ \
+  mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26)                \
+  mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26)                \
+  mov_line_end16x4_mmi_aligned($8, %[iStride], $f24, $f26)
+
+#define exp_cross_mmi_16_unaligned /* asm text: fill four 16-byte x 16-row blocks, one per pointer (iHeight, $11, $9, $8), each from its own register pair, using left/right partial stores */ \
+  mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14)      /* block 1: 3 x 4 + 4 = 16 rows at iHeight from $f12/$f14 */ \
+  mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14)      \
+  mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14)      \
+  mov_line_end16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14)   \
+  mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18)             /* block 2: 16 rows at $11 from $f16/$f18 */ \
+  mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18)             \
+  mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18)             \
+  mov_line_end16x4_mmi_unaligned($11, %[iStride], $f16, $f18)          \
+  mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22)              /* block 3: 16 rows at $9 from $f20/$f22 */ \
+  mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22)              \
+  mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22)              \
+  mov_line_end16x4_mmi_unaligned($9, %[iStride], $f20, $f22)           \
+  mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26)              /* block 4: 16 rows at $8 from $f24/$f26 */ \
+  mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26)              \
+  mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26)              \
+  mov_line_end16x4_mmi_unaligned($8, %[iStride], $f24, $f26)
+
+void ExpandPictureLuma_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth,
+                           int32_t iHeight) { /* Surrounds the iWidth x iHeight plane at pDst (row pitch iStride) with a 32-pixel border of replicated edge pixels; uses 128-bit stores throughout. */
+  BACKUP_REG; /* defined outside this view; presumably saves FP registers the asm overwrites */
+  __asm__ volatile (
+    ".set       arch=loongson3a                     \n\t" /* allow the Loongson-3A instructions used below */
+    "xor        $f28, $f28, $f28                    \n\t" /* $f28 = 0: passed to MMI_Copy16Times and used as the pshufh selector */
+    "lbu        $8, 0x0(%[pDst])                    \n\t" /* top-left pixel */
+    /* $f12/$f14 are consumed by the first block of exp_cross_mmi_32 (top-left corner). */
+    MMI_Copy16Times($f12, $f14, $f28, $8)
+    /* $9 = row just above the picture; iHeight is turned into a pointer to the last row. */
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDU   "$9, %[pDst], %[iStride]             \n\t"
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    "move       $10, %[iHeight]                     \n\t" /* keep the row count in $10 */
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1        \n\t" /* rows - 1 (immediate operand, hence the immediate-form add) */
+    "dmul       %[iHeight], %[iHeight], %[iStride]  \n\t"
+    PTR_ADDU   "%[iHeight], %[iHeight], %[pDst]     \n\t" /* iHeight = pDst + (rows - 1) * stride */
+    /* $11 = last row + 32 * stride, i.e. the lowest border row. */
+    "move       $8, %[iStride]                      \n\t"
+    "dsll       $8, 0x5                             \n\t"
+    PTR_ADDU   "$11, %[iHeight], $8                 \n\t"
+    /* Bottom-left pixel -> $f20/$f22, bottom-right pixel -> $f24/$f26 (corner fill values). */
+    "lbu        $8, 0x0(%[iHeight])                 \n\t"
+    MMI_Copy16Times($f20, $f22, $f28, $8)
+    PTR_ADDU   "$8, %[iHeight], %[iWidth]           \n\t"
+    PTR_ADDIU  "$8, -0x1                            \n\t"
+    "lbu        $8, 0x0($8)                         \n\t"
+    "dmtc1      $8, $f24                            \n\t" /* broadcast the byte: halfword shuffle with selector 0, then pack to bytes */
+    "pshufh     $f24, $f24, $f28                    \n\t"
+    "packushb   $f24, $f24, $f24                    \n\t"
+    "mov.d      $f26, $f24                          \n\t"
+    "dnegu      %[iStride], %[iStride]              \n\t" /* -stride: the first column walks upward from $9 and $11 */
+    "move       $12, %[pDst]                        \n\t" /* save operands that exp_top_bottom_mmi_32 modifies */
+    "move       $13, %[iStride]                     \n\t"
+    "move       $14, %[iWidth]                      \n\t"
+    exp_top_bottom_mmi_32
+    "move       %[iWidth], $14                      \n\t" /* restore width, -stride and pDst */
+    "move       %[iStride], $13                     \n\t"
+    "move       %[pDst], $12                        \n\t"
+    PTR_ADDIU  "$9, %[pDst], -0x20                  \n\t" /* $9 = start of the left border on the first row */
+    PTR_ADDU   "%[iHeight], %[pDst], %[iWidth]      \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1        \n\t" /* iHeight = last pixel of the first row */
+    PTR_ADDIU  "$11, %[iHeight], 0x1                \n\t" /* $11 = start of the right border on the first row */
+    "lbu        $8, 0x0(%[iHeight])                 \n\t" /* top-right pixel -> $f16/$f18 (corner fill value) */
+    MMI_Copy16Times($f16, $f18, $f28, $8)
+    "dnegu      %[iStride], %[iStride]              \n\t" /* back to +stride for the row-by-row left/right pass */
+    "move       $8, $10                             \n\t" /* $8 = row count = loop counter of exp_left_right_mmi_32 */
+    "move       $10, %[pDst]                        \n\t" /* save operands that exp_left_right_mmi_32 modifies */
+    "move       $12, %[iStride]                     \n\t"
+    "move       $13, %[iWidth]                      \n\t"
+    "move       $14, $8                             \n\t"
+
+    exp_left_right_mmi_32
+
+    "move       $8, $14                             \n\t" /* restore row count, width, +stride and pDst */
+    "move       %[iWidth], $13                      \n\t"
+    "move       %[iStride], $12                     \n\t"
+    "move       %[pDst], $10                        \n\t"
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDIU  "%[iHeight], %[pDst], -0x20          \n\t"
+    PTR_ADDU   "%[iHeight], %[iHeight], %[iStride]  \n\t" /* iHeight = pDst - 32 - stride: lowest row of the top-left corner */
+    PTR_ADDU   "$11, %[pDst], %[iWidth]             \n\t"
+    PTR_ADDU   "$11, $11, %[iStride]                \n\t" /* $11 = pDst + width - stride: lowest row of the top-right corner */
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDIU  "$8, $8, 0x20                        \n\t"
+    "dmul       $8, $8, %[iStride]                  \n\t" /* $8 = (rows + 32) * stride */
+    PTR_ADDU   "$9, %[iHeight], $8                  \n\t" /* $9 = lowest row of the bottom-left corner */
+    PTR_ADDU   "$8, $11, $8                         \n\t" /* $8 = lowest row of the bottom-right corner */
+    "dnegu      %[iStride], %[iStride]              \n\t" /* -stride: every corner is filled from its lowest row upward */
+    exp_cross_mmi_32
+    : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    :
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG, defined outside this view */
+}
+
+void ExpandPictureChromaUnalign_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth,
+                                    int32_t iHeight) { /* Surrounds the iWidth x iHeight plane at pDst (row pitch iStride) with a 16-pixel border of replicated edge pixels; uses left/right partial loads and stores. */
+  BACKUP_REG; /* defined outside this view; presumably saves FP registers the asm overwrites */
+  __asm__ volatile (
+    ".set       arch=loongson3a                     \n\t" /* allow the Loongson-3A instructions used below */
+    "xor        $f28, $f28, $f28                    \n\t" /* $f28 = 0: passed to MMI_Copy16Times */
+    "lbu        $8, 0x0(%[pDst])                    \n\t" /* top-left pixel */
+    /* $f12/$f14 are consumed by the first block of exp_cross_mmi_16_unaligned (top-left corner). */
+    MMI_Copy16Times($f12, $f14, $f28, $8)
+    /* $9 = row just above the picture; iHeight is turned into a pointer to the last row. */
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDU   "$9, %[pDst], %[iStride]             \n\t"
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    "move       $10, %[iHeight]                     \n\t" /* keep the row count in $10 */
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1        \n\t" /* rows - 1 (immediate operand, hence the immediate-form add) */
+    "dmul       %[iHeight], %[iHeight], %[iStride]  \n\t"
+    PTR_ADDU   "%[iHeight], %[iHeight], %[pDst]     \n\t" /* iHeight = pDst + (rows - 1) * stride */
+    "move       $8, %[iStride]                      \n\t"
+    "dsll       $8, 0x4                             \n\t"
+    PTR_ADDU   "$11, %[iHeight], $8                 \n\t" /* $11 = last row + 16 * stride: the lowest border row */
+    "lbu        $8, 0x0(%[iHeight])                 \n\t" /* bottom-left pixel */
+    /* $f20/$f22 = bottom-left corner fill value. */
+    MMI_Copy16Times($f20, $f22, $f28, $8)
+
+    PTR_ADDU   "$8, %[iHeight], %[iWidth]           \n\t"
+    PTR_ADDIU  "$8, -0x1                            \n\t"
+    "lbu        $8, 0x0($8)                         \n\t" /* bottom-right pixel */
+    /* $f24/$f26 = bottom-right corner fill value. */
+    MMI_Copy16Times($f24, $f26, $f28, $8)
+
+    "dnegu      %[iStride], %[iStride]              \n\t" /* -stride: the first column walks upward from $9 and $11 */
+    "move       $12, %[pDst]                        \n\t" /* save operands that the top/bottom pass modifies */
+    "move       $13, %[iStride]                     \n\t"
+    "move       $14, %[iWidth]                      \n\t"
+
+    exp_top_bottom_mmi_16_unaligned
+
+    "move       %[iWidth], $14                      \n\t" /* restore width, -stride and pDst */
+    "move       %[iStride], $13                     \n\t"
+    "move       %[pDst], $12                        \n\t"
+    PTR_ADDIU  "$9, %[pDst], -0x10                  \n\t" /* $9 = start of the left border on the first row */
+    PTR_ADDU   "%[iHeight], %[pDst], %[iWidth]      \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1        \n\t" /* iHeight = last pixel of the first row */
+    PTR_ADDIU  "$11, %[iHeight], 0x1                \n\t" /* $11 = start of the right border on the first row */
+    "lbu        $8, 0x0(%[iHeight])                 \n\t" /* top-right pixel -> $f16/$f18 (corner fill value) */
+    MMI_Copy16Times($f16, $f18, $f28, $8)
+
+    "dnegu      %[iStride], %[iStride]              \n\t" /* back to +stride for the row-by-row left/right pass */
+    "move       $8, $10                             \n\t" /* $8 = row count = loop counter of the left/right pass */
+
+    "move       $10, %[pDst]                        \n\t" /* save operands that the left/right pass modifies */
+    "move       $12, %[iStride]                     \n\t"
+    "move       $13, %[iWidth]                      \n\t"
+    "move       $14, $8                             \n\t"
+
+    exp_left_right_mmi_16_unaligned
+
+    "move       $8, $14                             \n\t" /* restore row count, width, +stride and pDst */
+    "move       %[iWidth], $13                      \n\t"
+    "move       %[iStride], $12                     \n\t"
+    "move       %[pDst], $10                        \n\t"
+
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDIU  "%[iHeight], %[pDst], -0x10          \n\t"
+    PTR_ADDU   "%[iHeight], %[iHeight], %[iStride]  \n\t" /* iHeight = pDst - 16 - stride: lowest row of the top-left corner */
+    PTR_ADDU   "$11, %[pDst], %[iWidth]             \n\t"
+    PTR_ADDU   "$11, $11, %[iStride]                \n\t" /* $11 = pDst + width - stride: lowest row of the top-right corner */
+
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                        \n\t"
+    "dmul       $8, $8, %[iStride]                  \n\t" /* $8 = (rows + 16) * stride */
+
+    PTR_ADDU   "$9, %[iHeight], $8                  \n\t" /* $9 = lowest row of the bottom-left corner */
+    PTR_ADDU   "$8, $11, $8                         \n\t" /* $8 = lowest row of the bottom-right corner */
+    "dnegu      %[iStride], %[iStride]              \n\t" /* -stride: every corner is filled from its lowest row upward */
+
+    exp_cross_mmi_16_unaligned
+    : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    :
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG, defined outside this view */
+}
+
+void ExpandPictureChromaAlign_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth,
+                                  int32_t iHeight) { /* Surrounds the iWidth x iHeight plane at pDst (row pitch iStride) with a 16-pixel border of replicated edge pixels; uses 128-bit loads and stores. */
+  BACKUP_REG; /* defined outside this view; presumably saves FP registers the asm overwrites */
+  __asm__ volatile (
+    ".set       arch=loongson3a                     \n\t" /* allow the Loongson-3A instructions used below */
+    "xor        $f28, $f28, $f28                    \n\t" /* $f28 = 0: passed to MMI_Copy16Times */
+    "lbu        $8, 0x0(%[pDst])                    \n\t" /* top-left pixel */
+    /* $f12/$f14 are consumed by the first block of exp_cross_mmi_16_aligned (top-left corner). */
+    MMI_Copy16Times($f12, $f14, $f28, $8)
+    /* $9 = row just above the picture; iHeight is turned into a pointer to the last row. */
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDU   "$9, %[pDst], %[iStride]             \n\t"
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    "move       $10, %[iHeight]                     \n\t" /* keep the row count in $10 */
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1        \n\t" /* rows - 1 (immediate operand, hence the immediate-form add) */
+    "dmul       %[iHeight], %[iHeight], %[iStride]  \n\t"
+    PTR_ADDU   "%[iHeight], %[iHeight], %[pDst]     \n\t" /* iHeight = pDst + (rows - 1) * stride */
+    "move       $8, %[iStride]                      \n\t"
+    "dsll       $8, 0x4                             \n\t"
+    PTR_ADDU   "$11, %[iHeight], $8                 \n\t" /* $11 = last row + 16 * stride: the lowest border row */
+    "lbu        $8, 0x0(%[iHeight])                 \n\t" /* bottom-left pixel */
+    /* $f20/$f22 = bottom-left corner fill value. */
+    MMI_Copy16Times($f20, $f22, $f28, $8)
+
+    PTR_ADDU   "$8, %[iHeight], %[iWidth]           \n\t"
+    PTR_ADDIU  "$8, -0x1                            \n\t"
+    "lbu        $8, 0x0($8)                         \n\t" /* bottom-right pixel */
+    /* $f24/$f26 = bottom-right corner fill value. */
+    MMI_Copy16Times($f24, $f26, $f28, $8)
+
+    "dnegu      %[iStride], %[iStride]              \n\t" /* -stride: the first column walks upward from $9 and $11 */
+
+    "move       $12, %[pDst]                        \n\t" /* save operands that the top/bottom pass modifies */
+    "move       $13, %[iStride]                     \n\t"
+    "move       $14, %[iWidth]                      \n\t"
+    exp_top_bottom_mmi_16_aligned
+
+    "move       %[iWidth], $14                      \n\t" /* restore width, -stride and pDst */
+    "move       %[iStride], $13                     \n\t"
+    "move       %[pDst], $12                        \n\t"
+
+    PTR_ADDIU  "$9, %[pDst], -0x10                  \n\t" /* $9 = start of the left border on the first row */
+
+    PTR_ADDU   "%[iHeight], %[pDst], %[iWidth]      \n\t"
+    PTR_ADDIU  "%[iHeight], %[iHeight], -0x1        \n\t" /* iHeight = last pixel of the first row */
+    PTR_ADDIU  "$11, %[iHeight], 0x1                \n\t" /* $11 = start of the right border on the first row */
+
+    "lbu        $8, 0x0(%[iHeight])                 \n\t" /* top-right pixel */
+    /* $f16/$f18 = top-right corner fill value. */
+    MMI_Copy16Times($f16, $f18, $f28, $8)
+
+    "dnegu      %[iStride], %[iStride]              \n\t" /* back to +stride for the row-by-row left/right pass */
+    "move       $8, $10                             \n\t" /* $8 = row count = loop counter of the left/right pass */
+
+    "move       $10, %[pDst]                        \n\t" /* save operands that the left/right pass modifies */
+    "move       $12, %[iStride]                     \n\t"
+    "move       $13, %[iWidth]                      \n\t"
+    "move       $14, $8                             \n\t"
+
+    exp_left_right_mmi_16_aligned
+
+    "move       $8, $14                             \n\t" /* restore row count, width, +stride and pDst */
+    "move       %[iWidth], $13                      \n\t"
+    "move       %[iStride], $12                     \n\t"
+    "move       %[pDst], $10                        \n\t"
+
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDIU  "%[iHeight], %[pDst], -0x10          \n\t"
+    PTR_ADDU   "%[iHeight], %[iHeight], %[iStride]  \n\t" /* iHeight = pDst - 16 - stride: lowest row of the top-left corner */
+    PTR_ADDU   "$11, %[pDst], %[iWidth]             \n\t"
+    PTR_ADDU   "$11, $11, %[iStride]                \n\t" /* $11 = pDst + width - stride: lowest row of the top-right corner */
+
+    "dnegu      %[iStride], %[iStride]              \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                        \n\t"
+    "dmul       $8, $8, %[iStride]                  \n\t" /* $8 = (rows + 16) * stride */
+
+    PTR_ADDU   "$9, %[iHeight], $8                  \n\t" /* $9 = lowest row of the bottom-left corner */
+    PTR_ADDU   "$8, $11, $8                         \n\t" /* $8 = lowest row of the bottom-right corner */
+    "dnegu      %[iStride], %[iStride]              \n\t" /* -stride: every corner is filled from its lowest row upward */
+
+    exp_cross_mmi_16_aligned
+    : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+      [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight)
+    :
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+      "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+      "$f22", "$f24", "$f26", "$f28"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG, defined outside this view */
+}
--- /dev/null
+++ b/codec/common/mips/intra_pred_com_mmi.c
@@ -1,0 +1,548 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    intra_pred_com_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+/* Emits one further output row for WelsI16x16LumaPredH_mmi; uses $8, $f0, $f2 and expects $f4 as prepared by the caller. */
+#define MMI_PRED_H_16X16_ONE_LINE \
+  PTR_ADDIU  "%[pPred], %[pPred], 0x10                  \n\t" /* advance the output pointer by 16 bytes */ \
+  PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* step pRef by kiStride (next reference row) */ \
+  "lbu        $8, 0x0(%[pRef])                          \n\t" /* $8 = byte at the current pRef */ \
+  MMI_Copy16Times($f0, $f2, $f4, $8)                          \
+  "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t" /* quad-word store of $f0/$f2 to the output row */
+/* Adds to $8 the bytes at pRef[-1] of the next two rows; advances pRef by 2 * kiStride and overwrites $9. */
+#define LOAD_2_LEFT_AND_ADD \
+  PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* first row: step down */ \
+  "lbu        $9, -0x1(%[pRef])                         \n\t" /* $9 = byte just left of pRef */ \
+  PTR_ADDU   "$8, $8, $9                                \n\t" /* accumulate */ \
+  PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* second row: step down */ \
+  "lbu        $9, -0x1(%[pRef])                         \n\t" /* $9 = byte just left of pRef */ \
+  PTR_ADDU   "$8, $8, $9                                \n\t" /* accumulate */
+// Advances r0 by kiStride, then stores one 8-byte row derived from the byte at r0[-1] to r1 + r1_offset.
+// Requires f2 = mmi_01bytes (0x01 in every byte), f4 = 0x38 (shift count) and f6 = 0x0 (shuffle selector).
+#define MMI_PRED_H_8X8_ONE_LINE(f0, f2, f4, f6, r0, r1, r1_offset) \
+  PTR_ADDU   ""#r0", "#r0", %[kiStride]                 \n\t" /* next reference row */ \
+  "gsldxc1    "#f0", -0x8("#r0", $0)                    \n\t" /* load the 8 bytes r0[-8..-1] */ \
+  "dsrl       "#f0", "#f0", "#f4"                       \n\t" /* shift right by f4 (56 bits): only the top byte remains */ \
+  "pmullh     "#f0", "#f0", "#f2"                       \n\t" /* halfword multiply by 0x0101: byte copied into both halves of halfword 0 */ \
+  "pshufh     "#f0", "#f0", "#f6"                       \n\t" /* shuffle with selector f6 (0): halfword 0 into every halfword */ \
+  "gssdxc1    "#f0", "#r1_offset"+0x0("#r1", $0)        \n\t" /* store the 8-byte row at r1 + r1_offset */
+/* Copies the 16 bytes at pRef - kiStride into each of 16 consecutive 16-byte rows starting at pPred (256 bytes written). */
+void WelsI16x16LumaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  __asm__ volatile (
+    ".set     arch=loongson3a                             \n\t"
+    PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* pRef now addresses the row above */
+    "gslqc1     $f2, $f0, 0x0(%[pRef])                    \n\t" /* quad-word load into $f0/$f2 */
+    /* NOTE(review): gslqc1/gssqc1 are quad-word accesses; presumably pRef - kiStride and pPred must be 16-byte aligned - confirm. */
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t" /* row 0 */
+    "gssqc1     $f2, $f0, 0x10(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x20(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x30(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x40(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x50(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x60(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x70(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x80(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x90(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xa0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xb0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xc0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xd0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xe0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xf0(%[pPred])                  \n\t" /* row 15 */
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride)
+    : "memory", "$f0", "$f2"
+  );
+}
+/* Writes 16 rows of 16 bytes to pPred; row i is built from the single byte at pRef[i * kiStride - 1]. */
+void WelsI16x16LumaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  __asm__ volatile (
+    ".set     arch=loongson3a                             \n\t"
+    PTR_ADDIU  "%[pRef], %[pRef], -0x1                    \n\t" /* address the byte left of the block */
+    "lbu        $8, 0x0(%[pRef])                          \n\t" /* $8 = left byte of row 0 */
+    "xor        $f4, $f4, $f4                             \n\t" /* $f4 = 0, passed to MMI_Copy16Times */
+    MMI_Copy16Times($f0, $f2, $f4, $8) /* defined outside this chunk; presumably replicates the byte in $8 across $f0/$f2 - confirm */
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t" /* store row 0 */
+    /* Rows 1..15: each expansion advances pPred by 16 and pRef by kiStride, then stores one row. */
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    MMI_PRED_H_16X16_ONE_LINE
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride)
+    : "memory", "$8", "$f0", "$f2", "$f4"
+  );
+}
+/* 16x16 luma DC: fills 16 rows of 16 bytes at pPred with (sum of the 16 bytes above + sum of the 16 bytes to the left + 16) >> 5. */
+void WelsI16x16LumaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+                {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; /* multiplier used to replicate the DC byte */
+  __asm__ volatile (
+    ".set     arch=loongson3a                             \n\t"
+    PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* row above the block */
+    "gslqc1     $f2, $f0, 0x0(%[pRef])                    \n\t" /* 16 top bytes into $f0/$f2 */
+    "xor        $f4, $f4, $f4                             \n\t"
+    "pasubub    $f0, $f0, $f4                             \n\t" /* |x - 0|: leaves the bytes unchanged */
+    "pasubub    $f2, $f2, $f4                             \n\t"
+    "biadd      $f0, $f0                                  \n\t" /* sum of the low 8 top bytes */
+    "biadd      $f2, $f2                                  \n\t" /* sum of the high 8 top bytes */
+    "paddh      $f0, $f0, $f2                             \n\t" /* $f0 = sum of all 16 top bytes */
+    /* Left column: rows 0 and 1 here, rows 2..15 through the seven macro expansions below. */
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $8, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    /* $8 = sum of the 16 left bytes; add the rounding term, combine with the top sum and shift right by 5. */
+    "dli        $10, 0x5                                  \n\t"
+    "dmtc1      $10, $f6                                  \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                              \n\t" /* explicit three-operand form, as used elsewhere in this file */
+    "dmtc1      $8, $f4                                   \n\t"
+    "paddh      $f0, $f0, $f4                             \n\t"
+    "psrlw      $f0, $f0, $f6                             \n\t"
+    "gsldxc1    $f6, 0x0(%[mmi_01bytes], $0)              \n\t"
+    "pmuluw     $f0, $f0, $f6                             \n\t" /* dc * 0x01010101: dc in each byte of the low word */
+    "punpcklwd  $f0, $f0, $f0                             \n\t" /* duplicate the low word into the high word */
+    "mov.d      $f2, $f0                                  \n\t"
+    /* Store the replicated DC value to all 16 rows. */
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x20(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x30(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x40(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x50(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x60(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x70(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x80(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x90(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xa0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xb0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xc0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xd0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xe0(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0xf0(%[pPred])                  \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6" /* $9 and $10 are written above, so they must be declared as clobbered */
+  );
+}
+/* 16x16 luma plane prediction: derives two scaled coefficients and a base term from bytes around pRef, then writes 16 rows of 16 bytes to pPred. */
+void WelsI16x16LumaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  short mmi_plane_inc_minus[8]__attribute__((aligned(16))) = {-7, -6, -5, -4,
+                                                              -3, -2, -1, 0}; /* multipliers for output columns 0..7 in the loop */
+  short mmi_plane_inc[8]__attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8}; /* weights; also multipliers for output columns 8..15 */
+  short mmi_plane_dec[8]__attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1}; /* weights for the mirrored half */
+  BACKUP_REG; /* macro defined outside this chunk; paired with RECOVER_REG at the end */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_ADDIU  "%[pRef], %[pRef], -0x1                    \n\t"
+    PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* pRef = original pRef - 1 - kiStride */
+    /* First coefficient: weighted difference of the bytes pRef[9..16] and pRef[0..7] in that row. */
+    "gsldlc1    $f0, 0x7(%[pRef])                         \n\t"
+    "xor        $f28, $f28, $f28                          \n\t" /* $f28 = 0, used for byte-to-halfword widening */
+    "gsldrc1    $f0, 0x0(%[pRef])                         \n\t" /* with the gsldlc1 above: unaligned load of pRef[0..7] */
+    "gslqc1     $f22, $f20, 0x0(%[mmi_plane_dec])         \n\t" /* $f20/$f22 = {8..1} */
+    "punpckhbh  $f2, $f0, $f28                            \n\t"
+    "punpcklbh  $f0, $f0, $f28                            \n\t"
+    "gsldlc1    $f4, 0x10(%[pRef])                        \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t"
+    "pmullh     $f2, $f2, $f22                            \n\t"
+    "gsldrc1    $f4, 0x9(%[pRef])                         \n\t" /* with the gsldlc1 above: unaligned load of pRef[9..16] */
+    "gslqc1     $f26, $f24, 0x0(%[mmi_plane_inc])         \n\t" /* $f24/$f26 = {1..8} */
+    "punpckhbh  $f6, $f4, $f28                            \n\t"
+    "punpcklbh  $f4, $f4, $f28                            \n\t"
+    "pmullh     $f4, $f4, $f24                            \n\t"
+    "pmullh     $f6, $f6, $f26                            \n\t"
+    "psubh      $f4, $f4, $f0                             \n\t"
+    "psubh      $f6, $f6, $f2                             \n\t"
+    /* SUMH_HORIZON and MMI_Copy8Times are defined outside this chunk; presumably a horizontal halfword sum and a halfword broadcast - confirm. */
+    "xor        $f8, $f8, $f8                             \n\t"
+    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+    "dmfc1      $8, $f4                                   \n\t"
+    "seh        $8, $8                                    \n\t" /* sign-extend the low halfword */
+    "mul        $8, $8, 0x5                               \n\t"
+    PTR_ADDIU  "$8, $8, 0x20                              \n\t"
+    "sra        $8, $8, 0x6                               \n\t" /* $8 = (5 * sum + 32) >> 6 */
+    MMI_Copy8Times($f4, $f6, $f28, $8)
+    /* $9 keeps the byte at pRef[16]; LOAD_COLUMN (defined outside this chunk) is invoked with pRef moved back 3 bytes. */
+    "lbu        $9, 0x10(%[pRef])                         \n\t"
+    PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
+    LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16,
+                $f18, %[pRef], %[kiStride], $11)
+    /* $9 = ($9 + byte at pRef[3 + 8 * kiStride]) << 4, with pRef as left by LOAD_COLUMN. */
+    PTR_ADDIU  "%[pRef], %[pRef], 0x3                     \n\t"
+    "dsll       $10, %[kiStride], 0x3                     \n\t"
+    PTR_ADDU   "$10, $10, %[pRef]                         \n\t"
+    "lbu        $8, 0x0($10)                              \n\t"
+    PTR_ADDU   "$9, $9, $8                                \n\t"
+    "dsll       $9, $9, 0x4                               \n\t"
+    /* Second LOAD_COLUMN, one row further down, into $f28/$f30. */
+    PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16,
+                $f18, %[pRef], %[kiStride], $11)
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    "punpcklbh  $f0, $f2, $f18                            \n\t" /* widen the bytes of $f2 to halfwords */
+    "punpckhbh  $f2, $f2, $f18                            \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t" /* weight by {8..1} */
+    "pmullh     $f2, $f2, $f22                            \n\t"
+    "punpcklbh  $f28, $f30, $f18                          \n\t" /* widen the bytes of $f30 to halfwords */
+    "punpckhbh  $f30, $f30, $f18                          \n\t"
+    "pmullh     $f28, $f28, $f24                          \n\t" /* weight by {1..8} */
+    "pmullh     $f30, $f30, $f26                          \n\t"
+    "psubh      $f28, $f28, $f0                           \n\t"
+    "psubh      $f30, $f30, $f2                           \n\t"
+    /* Second coefficient, scaled the same way: (5 * sum + 32) >> 6, copied into $f16/$f18. */
+    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+    "dmfc1      $8, $f28                                  \n\t"
+    "seh        $8, $8                                    \n\t"
+    "mul        $8, $8, 0x5                               \n\t"
+    PTR_ADDIU  "$8, $8, 0x20                              \n\t"
+    "sra        $8, $8, 0x6                               \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    MMI_Copy8Times($f16, $f18, $f20, $8)
+    /* Base term for row 0: $8 = $9 + 16 - 7 * (second coefficient), copied into $f0/$f2. */
+    PTR_ADDIU  "$9, $9, 0x10                              \n\t"
+    "mul        $8, $8, -0x7                              \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    MMI_Copy8Times($f0, $f2, $f20, $8)
+    /* Loop setup: $8 = row counter, $f20/$f22 = {-7..0}, $f30 = shift count 5. */
+    "xor        $8, $8, $8                                \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[mmi_plane_inc_minus])   \n\t"
+    /* NOTE(review): the store before label 1 targets the same 16 bytes the first loop iteration writes before pPred moves - looks redundant; confirm. */
+    "dli        $10, 0x5                                  \n\t"
+    "dmtc1      $10, $f30                                 \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    "1:                                                   \n\t"
+    "pmullh     $f8, $f4, $f20                            \n\t" /* columns 0..7: first coefficient * {-7..0} */
+    "pmullh     $f10, $f6, $f22                           \n\t"
+    "paddh      $f8, $f8, $f0                             \n\t" /* add the current row's base term */
+    "paddh      $f10, $f10, $f2                           \n\t"
+    "psrah      $f8, $f8, $f30                            \n\t" /* arithmetic shift right by 5 */
+    "psrah      $f10, $f10, $f30                          \n\t"
+    "pmullh     $f12, $f4, $f24                           \n\t" /* columns 8..15: first coefficient * {1..8} */
+    "pmullh     $f14, $f6, $f26                           \n\t"
+    "paddh      $f12, $f12, $f0                           \n\t"
+    "paddh      $f14, $f14, $f2                           \n\t"
+    "psrah      $f12, $f12, $f30                          \n\t"
+    "psrah      $f14, $f14, $f30                          \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t" /* pack halfwords to bytes (unsigned saturation) */
+    "packushb   $f10, $f12, $f14                          \n\t"
+    "gssqc1     $f10, $f8, 0x0(%[pPred])                  \n\t" /* store one 16-byte row */
+    "paddh      $f0, $f0, $f16                            \n\t" /* base term += second coefficient for the next row */
+    "paddh      $f2, $f2, $f18                            \n\t"
+    PTR_ADDIU  "%[pPred], %[pPred], 0x10                  \n\t"
+    PTR_ADDIU  "$8, $8, 0x1                               \n\t"
+    PTR_ADDIU  "$10, $8, -0x10                            \n\t"
+    "bnez       $10, 1b                                   \n\t" /* repeat until 16 rows are written */
+    "nop                                                  \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
+      [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
+    : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+      "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+      "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+/* 8x8 chroma plane prediction: derives two scaled coefficients and a base term from bytes around pRef, then writes 8 rows of 8 bytes to pPred. */
+void WelsIChromaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  short mmi_plane_inc_c[4]__attribute__((aligned(16))) = {1, 2, 3, 4}; /* weights for one half */
+  short mmi_plane_dec_c[4]__attribute__((aligned(16))) = {4, 3, 2, 1}; /* weights for the mirrored half */
+  short mmi_plane_mul_b_c[8]__attribute__((aligned(16))) = {-3, -2, -1, 0,
+                                                            1, 2, 3, 4}; /* per-column multipliers in the output loop */
+  BACKUP_REG; /* macro defined outside this chunk; paired with RECOVER_REG at the end */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_ADDIU  "%[pRef], %[pRef], -0x1                    \n\t"
+    PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* pRef = original pRef - 1 - kiStride */
+    /* First coefficient: weighted difference of the low four bytes of the two unaligned loads (at pRef + 5 and pRef + 0). */
+    "gsldlc1    $f0, 0x7(%[pRef])                         \n\t"
+    "xor        $f28, $f28, $f28                          \n\t" /* $f28 = 0, used for byte-to-halfword widening */
+    "gsldrc1    $f0, 0x0(%[pRef])                         \n\t"
+    "gsldxc1    $f20, 0x0(%[mmi_plane_dec_c], $0)         \n\t" /* $f20 = {4, 3, 2, 1} */
+    "punpcklbh  $f0, $f0, $f28                            \n\t"
+    "gsldlc1    $f4, 0xc(%[pRef])                         \n\t"
+    "pmullh     $f0, $f0, $f20                            \n\t"
+    "gsldrc1    $f4, 0x5(%[pRef])                         \n\t"
+    "gsldxc1    $f24, 0x0(%[mmi_plane_inc_c], $0)         \n\t" /* $f24 = {1, 2, 3, 4} */
+    "punpcklbh  $f4, $f4, $f28                            \n\t"
+    "pmullh     $f4, $f4, $f24                            \n\t"
+    "psubh      $f4, $f4, $f0                             \n\t"
+    /* SUMH_HORIZON and MMI_Copy8Times are defined outside this chunk; presumably a horizontal halfword sum and a halfword broadcast - confirm. */
+    "xor        $f6, $f6, $f6                             \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+    "dmfc1      $8, $f4                                   \n\t"
+    "seh        $8, $8                                    \n\t" /* sign-extend the low halfword */
+    "mul        $8, $8, 0x11                              \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                              \n\t"
+    "sra        $8, $8, 0x5                               \n\t" /* $8 = (17 * sum + 16) >> 5 */
+    MMI_Copy8Times($f4, $f6, $f28, $8)
+    /* $8 keeps the byte at pRef[8]; LOAD_COLUMN_C (defined outside this chunk) is invoked with pRef moved back 3 bytes. */
+    "lbu        $8, 0x8(%[pRef])                          \n\t"
+    PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
+    LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
+    /* $9 = ($8 + byte at pRef[3 + 4 * kiStride]) << 4, with pRef as left by LOAD_COLUMN_C. */
+    PTR_ADDIU  "%[pRef], %[pRef], 0x3                     \n\t"
+    "dsll       $10, %[kiStride], 0x2                     \n\t"
+    PTR_ADDU   "$10, $10, %[pRef]                         \n\t"
+    "lbu        $9, 0x0($10)                              \n\t"
+    PTR_ADDU   "$9, $9, $8                                \n\t"
+    "dsll       $9, $9, 0x4                               \n\t"
+    /* Second LOAD_COLUMN_C, one row further down, into $f28. */
+    PTR_ADDIU  "%[pRef], %[pRef], -0x3                    \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
+    "xor        $f16, $f16, $f16                          \n\t"
+    "punpckhbh  $f0, $f0, $f16                            \n\t" /* widen the high four bytes of $f0 */
+    "pmullh     $f0, $f0, $f20                            \n\t" /* weight by {4, 3, 2, 1} */
+    "punpckhbh  $f28, $f28, $f16                          \n\t" /* widen the high four bytes of $f28 */
+    "pmullh     $f28, $f28, $f24                          \n\t" /* weight by {1, 2, 3, 4} */
+    "psubh      $f28, $f28, $f0                           \n\t"
+    /* Second coefficient, scaled the same way: (17 * sum + 16) >> 5, copied into $f16/$f18. */
+    "xor        $f30, $f30, $f30                          \n\t"
+    "xor        $f8, $f8, $f8                             \n\t"
+    SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+    "dmfc1      $8, $f28                                  \n\t"
+    "seh        $8, $8                                    \n\t"
+    "mul        $8, $8, 0x11                              \n\t"
+    PTR_ADDIU  "$8, $8, 0x10                              \n\t"
+    "sra        $8, $8, 0x5                               \n\t"
+    MMI_Copy8Times($f16, $f18, $f8, $8)
+    /* Base term for row 0: $8 = $9 + 16 - 3 * (second coefficient), copied into $f0/$f2. */
+    PTR_ADDIU  "$9, $9, 0x10                              \n\t"
+    "mul        $8, $8, -0x3                              \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    MMI_Copy8Times($f0, $f2, $f8, $8)
+    /* Loop setup: $8 = row counter, $f20/$f22 = {-3..4}, $f30 = shift count 5. */
+    "xor        $8, $8, $8                                \n\t"
+    "gslqc1     $f22, $f20, 0x0(%[mmi_plane_mul_b_c])     \n\t"
+    /* NOTE(review): the 16-byte store before label 1 covers bytes the first two loop iterations then overwrite - looks redundant; confirm. */
+    "dli        $10, 0x5                                  \n\t"
+    "dmtc1      $10, $f30                                 \n\t"
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    /* One iteration per output row. */
+    "1:                                                   \n\t"
+    "pmullh     $f8, $f4, $f20                            \n\t" /* first coefficient * {-3, -2, -1, 0} */
+    "pmullh     $f10, $f6, $f22                           \n\t" /* first coefficient * {1, 2, 3, 4} */
+    "paddh      $f8, $f8, $f0                             \n\t" /* add the current row's base term */
+    "paddh      $f10, $f10, $f2                           \n\t"
+    "psrah      $f8, $f8, $f30                            \n\t" /* arithmetic shift right by 5 */
+    "psrah      $f10, $f10, $f30                          \n\t"
+    "packushb   $f8, $f8, $f10                            \n\t" /* pack halfwords to bytes (unsigned saturation) */
+    "gssdxc1    $f8, 0x0(%[pPred], $0)                    \n\t" /* store one 8-byte row */
+    "paddh      $f0, $f0, $f16                            \n\t" /* base term += second coefficient for the next row */
+    "paddh      $f2, $f2, $f18                            \n\t"
+    PTR_ADDIU  "%[pPred], %[pPred], 0x8                   \n\t"
+    PTR_ADDIU  "$8, $8, 0x1                               \n\t"
+    PTR_ADDIU  "$10, $8, -0x8                             \n\t"
+    "bnez       $10, 1b                                   \n\t" /* repeat until 8 rows are written */
+    "nop                                                  \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
+      [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+/* Fills the 64 bytes at pPred with eight copies of the 8 bytes found at pRef - kiStride. */
+void WelsIChromaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t" /* row above the block */
+    "gsldxc1    $f0, 0x0(%[pRef], $0)                     \n\t" /* 8 top bytes */
+    "mov.d      $f2, $f0                                  \n\t" /* duplicate so each quad-word store writes two rows */
+    /* Four 16-byte stores cover the 64-byte output. */
+    "gssqc1     $f2, $f0, 0x0(%[pPred])                   \n\t"
+    "gssqc1     $f2, $f0, 0x10(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x20(%[pPred])                  \n\t"
+    "gssqc1     $f2, $f0, 0x30(%[pPred])                  \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride)
+    : "memory", "$f0", "$f2"
+  );
+}
+/* 8x8 chroma DC: computes four averages from the 8 bytes above and the 8 bytes left of the block and writes one per 4x4 quadrant of pPred. */
+void WelsIChromaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  short mmi_0x02[4]__attribute__((aligned(16))) = {2, 0, 0, 0}; /* rounding constant 2 in the low halfword */
+  unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+                {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; /* multiplier used to replicate a byte */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_SUBU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "gsldxc1    $f0, 0x0(%[pRef], $0)                     \n\t" /* $f0 = 8 bytes of the row above */
+    /* $f2 = sum of the left bytes (pRef[-1]) of rows 0..3. */
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $8, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    "dmtc1      $8, $f2                                   \n\t"
+    /* $f4 = sum of the left bytes of rows 4..7. */
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $8, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    PTR_ADDU   "%[pRef], %[pRef], %[kiStride]             \n\t"
+    "lbu        $9, -0x1(%[pRef])                         \n\t"
+    PTR_ADDU   "$8, $8, $9                                \n\t"
+    "dmtc1      $8, $f4                                   \n\t"
+    /* Split the top row into its two 4-byte halves and sum each: $f6 = low word, $f0 = high word. */
+    "xor        $f8, $f8, $f8                             \n\t"
+    "punpcklwd  $f6, $f0, $f8                             \n\t"
+    "punpckhwd  $f0, $f0, $f8                             \n\t"
+    "pasubub    $f0, $f0, $f8                             \n\t" /* |x - 0|: leaves the bytes unchanged */
+    "pasubub    $f6, $f6, $f8                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    /* Combined sums: $f6 = top low half + left rows 0..3, $f2 = left rows 4..7 + top high half. */
+    "dadd       $f6, $f6, $f2                             \n\t"
+    "dadd       $f2, $f4, $f0                             \n\t"
+    /* $f8 = 2 (rounding term). */
+    "gsldxc1    $f8, 0x0(%[mmi_0x02], $0)                 \n\t"
+    /* Four-sample averages: $f0 = (top high half + 2) >> 2, $f4 = (left rows 4..7 + 2) >> 2. */
+    "dli        $10, 0x2                                  \n\t"
+    "dmtc1      $10, $f10                                 \n\t"
+    "dadd       $f0, $f0, $f8                             \n\t"
+    "dsrl       $f0, $f0, $f10                            \n\t"
+    /* (second four-sample average) */
+    "dadd       $f4, $f4, $f8                             \n\t"
+    "dsrl       $f4, $f4, $f10                            \n\t"
+    /* Eight-sample averages: (sum + 2 + 2) >> 3 for $f6 and $f2. */
+    "dli        $10, 0x3                                  \n\t"
+    "dmtc1      $10, $f10                                 \n\t"
+    "dadd       $f6, $f6, $f8                             \n\t"
+    "dadd       $f6, $f6, $f8                             \n\t"
+    "dsrl       $f6, $f6, $f10                            \n\t"
+    /* (second eight-sample average) */
+    "dadd       $f2, $f2, $f8                             \n\t"
+    "dadd       $f2, $f2, $f8                             \n\t"
+    "dsrl       $f2, $f2, $f10                            \n\t"
+    /* Replicate each average into 4 bytes (multiply by 0x01010101) and merge pairs: $f0 = {$f6 low word, $f0 high word}. */
+    "dli        $10, 0x20                                 \n\t"
+    "dmtc1      $10, $f10                                 \n\t"
+    "gsldxc1    $f12, 0x0(%[mmi_01bytes], $0)             \n\t"
+    "pmuluw     $f0, $f0, $f12                            \n\t"
+    "pmuluw     $f6, $f6, $f12                            \n\t"
+    "dsll       $f0, $f0, $f10                            \n\t" /* move to the high 32 bits */
+    "xor        $f0, $f0, $f6                             \n\t" /* combine; the two halves do not overlap */
+    /* Same for the lower rows: $f2 = {$f4 low word, $f2 high word}. */
+    "pmuluw     $f4, $f4, $f12                            \n\t"
+    "pmuluw     $f2, $f2, $f12                            \n\t"
+    "dsll       $f2, $f2, $f10                            \n\t"
+    "xor        $f2, $f2, $f4                             \n\t"
+    /* Rows 0..3 receive $f0. */
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    "gssdxc1    $f0, 0x8(%[pPred], $0)                    \n\t"
+    "gssdxc1    $f0, 0x10(%[pPred], $0)                   \n\t"
+    "gssdxc1    $f0, 0x18(%[pPred], $0)                   \n\t"
+    /* Rows 4..7 receive $f2. */
+    "gssdxc1    $f2, 0x20(%[pPred], $0)                   \n\t"
+    "gssdxc1    $f2, 0x28(%[pPred], $0)                   \n\t"
+    "gssdxc1    $f2, 0x30(%[pPred], $0)                   \n\t"
+    "gssdxc1    $f2, 0x38(%[pPred], $0)                   \n\t"
+    : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+    : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes),
+      [mmi_0x02]"r"((unsigned char *)mmi_0x02)
+    : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
+  );
+}
+/* Writes 8 rows of 8 bytes to pPred; row i is built from the single byte at pRef[i * kiStride - 1]. */
+void WelsIChromaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+  unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+                {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; /* multiplier used to duplicate a byte within a halfword */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "gsldxc1    $f2, 0x0(%[mmi_01bytes], $0)              \n\t" /* $f2 = 0x01 in every byte */
+    "dli        $8, 0x38                                  \n\t"
+    "dmtc1      $8, $f4                                   \n\t" /* $f4 = shift count 56 */
+    "xor        $f6, $f6, $f6                             \n\t" /* $f6 = 0, shuffle selector */
+    "gsldxc1    $f0, -0x8(%[pRef], $0)                    \n\t" /* row 0: load the 8 bytes pRef[-8..-1] */
+    "dsrl       $f0, $f0, $f4                             \n\t" /* keep only the top byte */
+    /* Same replicate-and-store sequence as MMI_PRED_H_8X8_ONE_LINE, without the row advance. */
+    "pmullh     $f0, $f0, $f2                             \n\t"
+    "pshufh     $f0, $f0, $f6                             \n\t"
+    "gssdxc1    $f0, 0x0(%[pPred], $0)                    \n\t"
+    /* Rows 1..7: each expansion steps pRef by kiStride and stores at the given offset from pPred. */
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x8)
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x10)
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x18)
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x20)
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x28)
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x30)
+    MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x38)
+   : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+   : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+   : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+  );
+}
--- /dev/null
+++ b/codec/common/mips/satd_sad_mmi.c
@@ -1,0 +1,2154 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2018, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file    satd_sad_mmi.c
+ *
+ * \brief   Loongson optimization
+ *
+ * \date    23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+#define MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0) /* folds the halfword lanes of f0/f2 together with saturating adds; f4, f6, f8, f10 and GPR r0 are overwritten */ \
+  "dli        "#r0", 0x10                               \n\t" \
+  "dmtc1      "#r0", "#f8"                              \n\t" /* f8 = shift count 0x10 (16 bits) */ \
+  "dli        "#r0", 0x20                               \n\t" \
+  "dmtc1      "#r0", "#f10"                             \n\t" /* f10 = shift count 0x20 (32 bits) */ \
+  "mov.d      "#f4", "#f2"                              \n\t" \
+  "xor        "#f6", "#f6", "#f6"                       \n\t" /* f6 = 0 */ \
+  "paddush    "#f0", "#f0", "#f4"                       \n\t" /* f0 += f2 (f4 is a copy of f2) */ \
+  "paddush    "#f2", "#f2", "#f6"                       \n\t" /* adds zero: f2 unchanged */ \
+  "dsrl       "#f6", "#f2", "#f10"                      \n\t" /* f6 = f2 >> 32 */ \
+  "punpcklwd  "#f4", "#f2", "#f2"                       \n\t" \
+  "punpckhwd  "#f4", "#f0", "#f4"                       \n\t" \
+  "paddush    "#f0", "#f0", "#f4"                       \n\t" \
+  "paddush    "#f2", "#f2", "#f6"                       \n\t" \
+  "dsrl       "#f4", "#f0", "#f8"                       \n\t" /* f4 = f0 >> 16 */ \
+  "pinsrh_3   "#f4", "#f4", "#f2"                       \n\t" \
+  "dsrl       "#f6", "#f2", "#f8"                       \n\t" /* f6 = f2 >> 16 */ \
+  "paddush    "#f0", "#f0", "#f4"                       \n\t" \
+  "paddush    "#f2", "#f2", "#f6"                       \n\t" /* results are left in f0 and f2 */
+
+#define MMI_GetSad8x4 /* SAD of four 8-byte rows; adds into $f24/$f26 and advances both pointers by four rows; uses $8, $9, $f0..$f14 */ \
+  PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 = pSample1 row 1 */ \
+  "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t" /* gsldlc1/gsldrc1 pair = one unaligned 8-byte load */ \
+  "gsldlc1    $f4, 0x7($8)                              \n\t" \
+  "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t" \
+  "gsldrc1    $f4, 0x0($8)                              \n\t" \
+  PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 -> row 2 */ \
+  PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 = pSample1 row 3 */ \
+  "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t" \
+  "gsldlc1    $f6, 0x7($8)                              \n\t" \
+  "gsldlc1    $f8, 0x7(%[pSample2])                     \n\t" \
+  "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t" \
+  "gsldrc1    $f6, 0x0($8)                              \n\t" \
+  "gsldrc1    $f8, 0x0(%[pSample2])                     \n\t" \
+  PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 = pSample2 row 1 */ \
+  PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 2 */ \
+  "gsldlc1    $f12, 0x7($9)                             \n\t" \
+  "gsldlc1    $f10, 0x7(%[pSample2])                    \n\t" \
+  "gsldrc1    $f12, 0x0($9)                             \n\t" \
+  PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 = pSample2 row 3 */ \
+  "gsldrc1    $f10, 0x0(%[pSample2])                    \n\t" \
+  "gsldlc1    $f14, 0x7($9)                             \n\t" \
+  "gsldrc1    $f14, 0x0($9)                             \n\t" \
+  "pasubub    $f0, $f0, $f8                             \n\t" /* per-byte absolute difference, row 0 */ \
+  PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 -> row 4 (start of the next group) */ \
+  "pasubub    $f2, $f2, $f10                            \n\t" \
+  "biadd      $f0, $f0                                  \n\t" /* sum the eight byte differences */ \
+  "biadd      $f2, $f2                                  \n\t" \
+  "pasubub    $f4, $f4, $f12                            \n\t" \
+  PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 4 */ \
+  "pasubub    $f6, $f6, $f14                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f24, $f24, $f0                           \n\t" /* accumulate rows 0 and 1 in $f24, rows 2 and 3 in $f26 */ \
+  "paddh      $f26, $f26, $f2                           \n\t" \
+  "paddh      $f24, $f24, $f4                           \n\t" \
+  "paddh      $f26, $f26, $f6                           \n\t"
+
+#define MMI_GetSad8x4_End /* same as MMI_GetSad8x4 but without the final pointer advance: pSample1/pSample2 are left on row 2 */ \
+  PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 = pSample1 row 1 */ \
+  "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t" \
+  "gsldlc1    $f4, 0x7($8)                              \n\t" \
+  "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t" \
+  "gsldrc1    $f4, 0x0($8)                              \n\t" \
+  PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 -> row 2 */ \
+  PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 = pSample1 row 3 */ \
+  "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t" \
+  "gsldlc1    $f6, 0x7($8)                              \n\t" \
+  "gsldlc1    $f8, 0x7(%[pSample2])                     \n\t" \
+  "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t" \
+  "gsldrc1    $f6, 0x0($8)                              \n\t" \
+  "gsldrc1    $f8, 0x0(%[pSample2])                     \n\t" \
+  PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 = pSample2 row 1 */ \
+  PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 2 */ \
+  "gsldlc1    $f12, 0x7($9)                             \n\t" \
+  "gsldlc1    $f10, 0x7(%[pSample2])                    \n\t" \
+  "gsldrc1    $f12, 0x0($9)                             \n\t" \
+  PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 = pSample2 row 3 */ \
+  "gsldrc1    $f10, 0x0(%[pSample2])                    \n\t" \
+  "gsldlc1    $f14, 0x7($9)                             \n\t" \
+  "gsldrc1    $f14, 0x0($9)                             \n\t" \
+  "pasubub    $f0, $f0, $f8                             \n\t" /* per-byte absolute differences */ \
+  "pasubub    $f2, $f2, $f10                            \n\t" \
+  "biadd      $f0, $f0                                  \n\t" /* sum the byte differences of each row */ \
+  "biadd      $f2, $f2                                  \n\t" \
+  "pasubub    $f4, $f4, $f12                            \n\t" \
+  "pasubub    $f6, $f6, $f14                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f24, $f24, $f0                           \n\t" /* accumulate into $f24/$f26 */ \
+  "paddh      $f26, $f26, $f2                           \n\t" \
+  "paddh      $f24, $f24, $f4                           \n\t" \
+  "paddh      $f26, $f26, $f6                           \n\t"
+
+#define CACHE_SPLIT_CHECK(r0, width, cacheline) /* NOTE(review): width and cacheline are not used in the expansion; 0x1f is hard-coded - confirm this is intended */ \
+  "and        "#r0", "#r0", 0x1f                        \n\t" /* r0 &= 0x1f */ \
+  PTR_ADDIU  ""#r0", "#r0", -0x1f                       \n\t" /* r0 -= 0x1f, so r0 is zero only when the masked value was 0x1f */
+
+#define MMI_GetSad2x16 /* SAD of two 16-byte rows; both pointers are advanced BEFORE each row is read; adds into $f0/$f2 */ \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" /* step to the next row first */ \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* pSample2: unaligned 16-byte load as two gsldlc1/gsldrc1 pairs */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" /* pSample1: quad-word load; presumably needs 16-byte alignment - TODO confirm */ \
+  "pasubub    $f4, $f4, $f8                             \n\t" /* per-byte absolute difference */ \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" /* sum the byte differences */ \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f0, $f0, $f4                             \n\t" /* accumulate low half in $f0, high half in $f2 */ \
+  "paddh      $f2, $f2, $f6                             \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" /* second row */ \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f0, $f0, $f4                             \n\t" \
+  "paddh      $f2, $f2, $f6                             \n\t"
+
+#define MMI_GetSad4x16 /* SAD of four 16-byte rows, pSample2 read unaligned; adds into $f28/$f30; both pointers advance after every row */ \
+  "gsldlc1    $f0, 0x7(%[pSample2])                     \n\t" /* row 0: unaligned 16-byte load of pSample2 into $f0/$f2 */ \
+  "gsldlc1    $f2, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f0, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f2, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" /* pSample1: quad-word load; presumably needs 16-byte alignment - TODO confirm */ \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f0, $f0, $f8                             \n\t" /* per-byte absolute difference */ \
+  "pasubub    $f2, $f2, $f10                            \n\t" \
+  "biadd      $f0, $f0                                  \n\t" /* sum the byte differences */ \
+  "biadd      $f2, $f2                                  \n\t" \
+  "paddh      $f28, $f28, $f0                           \n\t" /* accumulate low half in $f28, high half in $f30 */ \
+  "paddh      $f30, $f30, $f2                           \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* row 1 (rows 1..3 use $f4/$f6) */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* row 2 */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* row 3 */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" /* leave both pointers on the row after this group */ \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t"
+
+#define MMI_GetSad4x16_Aligned /* SAD of four 16-byte rows, both blocks read with gslqc1; adds into $f28/$f30; both pointers advance after every row */ \
+  "gslqc1     $f2, $f0, 0x0(%[pSample2])                \n\t" /* row 0: quad-word loads; presumably need 16-byte alignment - TODO confirm */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f0, $f0, $f8                             \n\t" /* per-byte absolute difference */ \
+  "pasubub    $f2, $f2, $f10                            \n\t" \
+  "biadd      $f0, $f0                                  \n\t" /* sum the byte differences */ \
+  "biadd      $f2, $f2                                  \n\t" \
+  "paddh      $f28, $f28, $f0                           \n\t" /* accumulate low half in $f28, high half in $f30 */ \
+  "paddh      $f30, $f30, $f2                           \n\t" \
+  "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" /* row 1 (rows 1..3 use $f4/$f6) */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" /* row 2 */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" /* row 3 */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" /* leave both pointers on the row after this group */ \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t"
+
+#define MMI_GetSad4x16_End /* like MMI_GetSad4x16 (pSample2 read unaligned) but the pointers are not advanced after the fourth row */ \
+  "gsldlc1    $f0, 0x7(%[pSample2])                     \n\t" /* row 0: unaligned 16-byte load of pSample2 into $f0/$f2 */ \
+  "gsldlc1    $f2, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f0, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f2, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" /* pSample1: quad-word load; presumably needs 16-byte alignment - TODO confirm */ \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f0, $f0, $f8                             \n\t" /* per-byte absolute difference */ \
+  "pasubub    $f2, $f2, $f10                            \n\t" \
+  "biadd      $f0, $f0                                  \n\t" /* sum the byte differences */ \
+  "biadd      $f2, $f2                                  \n\t" \
+  "paddh      $f28, $f28, $f0                           \n\t" /* accumulate low half in $f28, high half in $f30 */ \
+  "paddh      $f30, $f30, $f2                           \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* row 1 (rows 1..3 use $f4/$f6) */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* row 2 */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t" /* row 3: last row, no pointer update follows */ \
+  "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t" \
+  "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t" \
+  "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t" \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t"
+
+#define MMI_GetSad4x16_Aligned_End /* like MMI_GetSad4x16_Aligned (both blocks read with gslqc1) but the pointers are not advanced after the fourth row */ \
+  "gslqc1     $f2, $f0, 0x0(%[pSample2])                \n\t" /* row 0: quad-word loads; presumably need 16-byte alignment - TODO confirm */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f0, $f0, $f8                             \n\t" /* per-byte absolute difference */ \
+  "pasubub    $f2, $f2, $f10                            \n\t" \
+  "biadd      $f0, $f0                                  \n\t" /* sum the byte differences */ \
+  "biadd      $f2, $f2                                  \n\t" \
+  "paddh      $f28, $f28, $f0                           \n\t" /* accumulate low half in $f28, high half in $f30 */ \
+  "paddh      $f30, $f30, $f2                           \n\t" \
+  "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" /* row 1 (rows 1..3 use $f4/$f6) */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" /* row 2 */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" \
+  PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t" \
+  "gslqc1     $f6, $f4, 0x0(%[pSample2])                \n\t" /* row 3: last row, no pointer update follows */ \
+  "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" \
+  "pasubub    $f4, $f4, $f8                             \n\t" \
+  "pasubub    $f6, $f6, $f10                            \n\t" \
+  "biadd      $f4, $f4                                  \n\t" \
+  "biadd      $f6, $f6                                  \n\t" \
+  "paddh      $f28, $f28, $f4                           \n\t" \
+  "paddh      $f30, $f30, $f6                           \n\t"
+
+#define MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0) /* four 16-byte SADs accumulated into $f20/$f22, $f16/$f18, $f24/$f26 and $f28/$f30; f0, f2, f12, f14 are overwritten */ \
+  "pasubub    "#f0", "#f0", "#f12"                      \n\t" /* |f0:f2 - f12:f14| */ \
+  "pasubub    "#f2", "#f2", "#f14"                      \n\t" \
+  "pasubub    "#f12", "#f12", "#f8"                     \n\t" /* |f12:f14 - f8:f10| */ \
+  "pasubub    "#f14", "#f14", "#f10"                    \n\t" \
+  "biadd      "#f0", "#f0"                              \n\t" /* sum the byte differences */ \
+  "biadd      "#f2", "#f2"                              \n\t" \
+  "biadd      "#f12", "#f12"                            \n\t" \
+  "biadd      "#f14", "#f14"                            \n\t" \
+  "paddh      $f20, $f20, "#f0"                         \n\t" /* first SAD -> $f20/$f22 */ \
+  "paddh      $f22, $f22, "#f2"                         \n\t" \
+  "paddh      $f16, $f16, "#f12"                        \n\t" /* second SAD -> $f16/$f18 */ \
+  "paddh      $f18, $f18, "#f14"                        \n\t" \
+  "gsldlc1    "#f12", 0x6("#r0")                        \n\t" /* unaligned load of the 16 bytes starting at r0 - 1 */ \
+  "gsldlc1    "#f14", 0xE("#r0")                        \n\t" \
+  "gsldrc1    "#f12", -0x1("#r0")                       \n\t" \
+  "gsldrc1    "#f14", 0x7("#r0")                        \n\t" \
+  "pasubub    "#f12", "#f12", "#f4"                     \n\t" /* compared against f4:f6 */ \
+  "pasubub    "#f14", "#f14", "#f6"                     \n\t" \
+  "biadd      "#f12", "#f12"                            \n\t" \
+  "biadd      "#f14", "#f14"                            \n\t" \
+  "paddh      $f24, $f24, "#f12"                        \n\t" /* third SAD -> $f24/$f26 */ \
+  "paddh      $f26, $f26, "#f14"                        \n\t" \
+  "gsldlc1    "#f12", 0x8("#r0")                        \n\t" /* unaligned load of the 16 bytes starting at r0 + 1 */ \
+  "gsldlc1    "#f14", 0x10("#r0")                       \n\t" \
+  "gsldrc1    "#f12", 0x1("#r0")                        \n\t" \
+  "gsldrc1    "#f14", 0x9("#r0")                        \n\t" \
+  "pasubub    "#f12", "#f12", "#f4"                     \n\t" /* compared against f4:f6 */ \
+  "pasubub    "#f14", "#f14", "#f6"                     \n\t" \
+  "biadd      "#f12", "#f12"                            \n\t" \
+  "biadd      "#f14", "#f14"                            \n\t" \
+  "paddh      $f28, $f28, "#f12"                        \n\t" /* fourth SAD -> $f28/$f30 */ \
+  "paddh      $f30, $f30, "#f14"                        \n\t"
+
+#define MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) /* four MMI_SumSub steps over the register pairs (f0,f2) (f4,f6) (f8,f10) (f12,f14); f16/f18 are passed as the last two arguments each time */ \
+  MMI_SumSub(f0, f2, f4, f6, f16, f18)      /* MMI_SumSub is defined outside this view; presumably a sum/difference butterfly - TODO confirm */ \
+  MMI_SumSub(f8, f10, f12, f14, f16, f18)   \
+  MMI_SumSub(f4, f6, f12, f14, f16, f18)    \
+  MMI_SumSub(f0, f2, f8, f10, f16, f18)
+
+#define MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26) /* applies WELS_AbsH to four register pairs, then adds them with saturation into f24/f26 */ \
+  WELS_AbsH(f0, f2, f0, f2, f8, f10)                          /* WELS_AbsH is defined outside this view; presumably a halfword absolute value using f8/f10 as scratch - TODO confirm */ \
+  WELS_AbsH(f4, f6, f4, f6, f8, f10)                          \
+  WELS_AbsH(f12, f14, f12, f14, f20, f22)                     \
+  WELS_AbsH(f16, f18, f16, f18, f20, f22)                     \
+  "paddush    "#f0", "#f0", "#f4"                       \n\t" /* (f0,f2) += (f4,f6) */ \
+  "paddush    "#f2", "#f2", "#f6"                       \n\t" \
+  "paddush    "#f12", "#f12", "#f16"                    \n\t" /* (f12,f14) += (f16,f18) */ \
+  "paddush    "#f14", "#f14", "#f18"                    \n\t" \
+  "paddush    "#f24", "#f24", "#f0"                     \n\t" /* fold both partial sums into the accumulator pair f24/f26 */ \
+  "paddush    "#f26", "#f26", "#f2"                     \n\t" \
+  "paddush    "#f24", "#f24", "#f12"                    \n\t" \
+  "paddush    "#f26", "#f26", "#f14"                    \n\t"
+
+#define MMI_SumWHorizon(f0, f2, f4, f6, f8, f10) /* reduces the halfword sums in f0/f2; result is left in f0, f2 is overwritten; f4 and f6 are not used by the expansion */ \
+  "paddh      "#f0", "#f0", "#f2"                       \n\t" /* f0 += f2, per halfword */ \
+  "punpckhhw  "#f2", "#f0", "#f8"                       \n\t" /* interleave the high halfwords with f8 (presumably zero, i.e. widen to words - TODO confirm) */ \
+  "punpcklhw  "#f0", "#f0", "#f8"                       \n\t" /* same for the low halfwords */ \
+  "paddw      "#f0", "#f0", "#f2"                       \n\t" /* add the two word vectors */ \
+  "pshufh     "#f2", "#f0", "#f10"                      \n\t" /* reorder halfwords with the selector held in f10 */ \
+  "paddw      "#f0", "#f0", "#f2"                       \n\t" /* final word add */
+
+#define MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1) /* f0/f2 = halfword differences of 8 bytes at r0 minus 8 bytes at r1; also sets $11/$12 to the next-row addresses */ \
+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" /* unaligned 8-byte loads from r0 and r1 */ \
+  "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
+  PTR_ADDU   "$11, %[pSample1], %[iStride1]   \n\t" /* hard-coded: $11 = pSample1 + iStride1, independent of the r0 argument */ \
+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
+  PTR_ADDU   "$12, %[pSample2], %[iStride2]   \n\t" /* hard-coded: $12 = pSample2 + iStride2, independent of the r1 argument */ \
+  "punpckhbh  "#f2", "#f0", "#f8"             \n\t" /* interleave bytes with f8 (presumably zero, widening to halfwords - TODO confirm) */ \
+  "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
+  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
+  "psubh      "#f0", "#f0", "#f4"             \n\t" /* halfword subtract; f4/f6 are left holding the widened r1 data */ \
+  "psubh      "#f2", "#f2", "#f6"             \n\t"
+
+#define MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1) /* same load-and-subtract as the _Stride0 variant, but advances pSample1/pSample2 to one row past $11/$12 */ \
+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" /* unaligned 8-byte loads from r0 and r1 */ \
+  "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
+  PTR_ADDU   "%[pSample1], $11, %[iStride1]   \n\t" /* hard-coded: pSample1 = $11 + iStride1 */ \
+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
+  PTR_ADDU   "%[pSample2], $12, %[iStride2]   \n\t" /* hard-coded: pSample2 = $12 + iStride2 */ \
+  "punpckhbh  "#f2", "#f0", "#f8"             \n\t" /* interleave bytes with f8 (presumably zero, widening to halfwords - TODO confirm) */ \
+  "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
+  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
+  "psubh      "#f0", "#f0", "#f4"             \n\t" /* halfword subtract; result in f0/f2 */ \
+  "psubh      "#f2", "#f2", "#f6"             \n\t"
+
+#define MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1) /* same load-and-subtract as the _Stride variants, but repositions pSample1/pSample2 to $9 + 8 and $10 + 8 */ \
+  "gsldlc1    "#f0", 0x7("#r0")               \n\t" /* unaligned 8-byte loads from r0 and r1 */ \
+  "gsldlc1    "#f4", 0x7("#r1")               \n\t" \
+  PTR_ADDU   "%[pSample1], $9, 0x8            \n\t" /* $9 must be set by the surrounding asm; presumably the saved block start of pSample1 - TODO confirm */ \
+  "gsldrc1    "#f0", 0x0("#r0")               \n\t" \
+  "gsldrc1    "#f4", 0x0("#r1")               \n\t" \
+  PTR_ADDU   "%[pSample2], $10, 0x8           \n\t" /* likewise $10 for pSample2 */ \
+  "punpckhbh  "#f2", "#f0", "#f8"             \n\t" /* interleave bytes with f8 (presumably zero, widening to halfwords - TODO confirm) */ \
+  "punpcklbh  "#f0", "#f0", "#f8"             \n\t" \
+  "punpckhbh  "#f6", "#f4", "#f8"             \n\t" \
+  "punpcklbh  "#f4", "#f4", "#f8"             \n\t" \
+  "psubh      "#f0", "#f0", "#f4"             \n\t" /* halfword subtract; result in f0/f2 */ \
+  "psubh      "#f2", "#f2", "#f6"             \n\t"
+
+#define MMI_GetSatd8x8 /* two passes of four 8-pixel rows each: load differences, MMI_HDMTwo4x4, MMI_TransTwo4x4H, MMI_HDMTwo4x4, then MMI_SumAbs4 into $f24/$f26; $f28 is passed as the unpack operand */ \
+  MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        /* rows 0..3: differences into ($f0,$f2) ($f4,$f6) ($f8,$f10) ($f12,$f14) */ \
+  MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
+  MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
+  MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
+  MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
+  MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    /* defined outside this view; presumably a 4x4 halfword transpose - TODO confirm */ \
+  MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
+  MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+  MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        /* rows 4..7: same sequence; pointers end one row past the block */ \
+  MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
+  MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
+  MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
+  MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
+  MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
+  MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
+  MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+#define MMI_GetSatd8x8_Offset8 /* same sequence as MMI_GetSatd8x8, except the eighth row load uses MMI_LoadDiff8P_Offset8, which repositions the pointers to $9 + 8 and $10 + 8 */ \
+  MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        /* rows 0..3 */ \
+  MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
+  MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
+  MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
+  MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
+  MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    /* defined outside this view; presumably a 4x4 halfword transpose - TODO confirm */ \
+  MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
+  MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+  MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        /* rows 4..7 */ \
+  MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
+  MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
+  MMI_LoadDiff8P_Offset8($f12, $f14, $f20, $f22, $f28, $11, $12)                             /* last row: pointers jump to $9 + 8 / $10 + 8 instead of the next row */ \
+  MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
+  MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
+  MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
+  MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+#define MMI_GetSatd8x8_End /* same sequence as MMI_GetSatd8x8, except the eighth row load uses MMI_LoadDiff8P instead of the _Stride1 variant */ \
+  MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        /* rows 0..3 */ \
+  MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
+  MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
+  MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12)                      \
+  MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
+  MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    /* defined outside this view; presumably a 4x4 halfword transpose - TODO confirm */ \
+  MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
+  MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+  MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2])        /* rows 4..7 */ \
+  MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12)                        \
+  MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2])       \
+  MMI_LoadDiff8P($f12, $f14, $f20, $f22, $f28, $11, $12)                                     /* MMI_LoadDiff8P is defined outside this view; presumably the plain load without a pointer update - TODO confirm */ \
+  MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18)                       \
+  MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18)                    \
+  MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22)                     \
+  MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+/* Returns the 16x16 SAD of pSample1 vs pSample2 (row pitches iStride1/iStride2); pSample1 is always read with gslqc1, so it is presumably 16-byte aligned. */
+int32_t WelsSampleSad16x16_mmi (uint8_t* pSample1, int32_t iStride1,
+                                uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "and        $8, %[pSample2], 0xF                      \n\t" /* $8 = offset of pSample2 within a 16-byte line */
+    "xor        $f28, $f28, $f28                          \n\t" /* clear the SAD accumulators $f28/$f30 */
+    "xor        $f30, $f30, $f30                          \n\t"
+    "bnez       $8, 1f                                    \n\t" /* numeric local labels: no duplicate symbols if the asm is inlined or cloned */
+    MMI_GetSad4x16_Aligned /* fall-through path: pSample2 is 16-byte aligned, 4 x 4 rows */
+    MMI_GetSad4x16_Aligned
+    MMI_GetSad4x16_Aligned
+    MMI_GetSad4x16_Aligned_End
+    "b          2f                                        \n\t"
+
+    "1:                                                   \n\t" /* pSample2 is not 16-byte aligned: use unaligned loads */
+    MMI_GetSad4x16
+    MMI_GetSad4x16
+    MMI_GetSad4x16
+    MMI_GetSad4x16_End
+    "2:                                                   \n\t" /* common exit: combine the two partial sums */
+    "mov.d      $f0, $f30                                 \n\t"
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "dmfc1      %[iSadSum], $f0                           \n\t" /* move the total to the result register */
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSadSum;
+}
+/*
+ * WelsSampleSad16x8_mmi: sum of absolute differences (SAD) of a 16x8 block,
+ * Loongson MMI implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride in bytes
+ *   pSample2, iStride2 - second block and its row stride in bytes
+ *   returns            - the value moved out of $f0 after the two partial
+ *                        sums $f0 and $f2 have been added together
+ *
+ * The first two rows are handled inline; the remaining work is delegated to
+ * three expansions of MMI_GetSad2x16.
+ * NOTE(review): MMI_GetSad2x16 and BACKUP_REG/RECOVER_REG are defined outside
+ * this chunk; presumably each MMI_GetSad2x16 advances both pointers, handles
+ * two more rows and accumulates into $f0/$f2 - confirm.
+ * NOTE(review): pSample1 is read with gslqc1 (128-bit load) while pSample2 is
+ * read with gsldlc1/gsldrc1 pairs; presumably pSample1 must be 16-byte
+ * aligned - confirm against the callers.
+ */
+int32_t WelsSampleSad16x8_mmi (uint8_t* pSample1, int32_t iStride1,
+                               uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample2])                     \n\t"  /* row 0 of pSample2 -> $f0 (bytes 0-7), $f2 (bytes 8-15) */
+    "gsldlc1    $f2, 0xF(%[pSample2])                     \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f2, 0x8(%[pSample2])                     \n\t"
+    "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"  /* row 0 of pSample1 -> $f8/$f10 */
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"  /* step both pointers to row 1 */
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    "pasubub    $f0, $f0, $f8                             \n\t"  /* per-byte absolute difference */
+    "pasubub    $f2, $f2, $f10                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"  /* add up the bytes of each half */
+    "biadd      $f2, $f2                                  \n\t"
+    "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"  /* row 1, same scheme with $f4/$f6 */
+    "gsldlc1    $f6, 0xF(%[pSample2])                     \n\t"
+    "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f6, 0x8(%[pSample2])                     \n\t"
+    "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
+    "pasubub    $f4, $f4, $f8                             \n\t"
+    "pasubub    $f6, $f6, $f10                            \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f0, $f0, $f4                             \n\t"  /* $f0/$f2 now hold the sums of rows 0 and 1 */
+    "paddh      $f2, $f2, $f6                             \n\t"
+
+    MMI_GetSad2x16
+    MMI_GetSad2x16
+    MMI_GetSad2x16
+
+    "paddh      $f0, $f0, $f2                             \n\t"  /* combine left-half and right-half sums */
+    "dmfc1      %[iSadSum], $f0                           \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
+    : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
+  );
+  RECOVER_REG;
+  return iSadSum;
+}
+/*
+ * WelsSampleSad8x16_mmi: sum of absolute differences (SAD) of an 8x16 block,
+ * Loongson MMI implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride
+ *   pSample2, iStride2 - second block and its row stride
+ *   returns            - the value moved out of $f0 after the accumulators
+ *                        $f24 and $f26 have been added together
+ *
+ * The accumulators are cleared, then MMI_GetSad8x4 is expanded three times
+ * followed by MMI_GetSad8x4_End.
+ * NOTE(review): MMI_GetSad8x4*, BACKUP_REG and RECOVER_REG are defined
+ * outside this chunk; presumably each expansion covers four rows and adds
+ * into $f24/$f26 - confirm against their definitions.
+ */
+int32_t WelsSampleSad8x16_mmi (uint8_t* pSample1, int32_t iStride1,
+                               uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* clear both accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    MMI_GetSad8x4
+    MMI_GetSad8x4
+    MMI_GetSad8x4
+    MMI_GetSad8x4_End
+    "paddh      $f0, $f26, $f24                           \n\t"  /* fold the accumulators */
+    "dmfc1      %[iSadSum], $f0                           \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
+  );
+  RECOVER_REG;
+  return iSadSum;
+}
+/*
+ * WelsSampleSad4x4_mmi: sum of absolute differences (SAD) of a 4x4 block,
+ * Loongson MMI implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride in bytes
+ *   pSample2, iStride2 - second block and its row stride in bytes
+ *   returns            - the value moved out of $f0 after the partial sums
+ *                        of rows 0-1 and rows 2-3 have been added
+ *
+ * Two rows are processed at a time: the low 32-bit words of two consecutive
+ * rows are packed into one 64-bit register with punpcklwd, so a single
+ * pasubub/biadd pair covers eight pixels.
+ * Only $f0-$f8 are used, and this function does not use
+ * BACKUP_REG/RECOVER_REG.
+ * NOTE(review): every row is fetched with an 8-byte gsldlc1/gsldrc1 pair, so
+ * four bytes past each 4-pixel row are read as well - confirm the buffers
+ * always have that slack.
+ */
+int32_t WelsSampleSad4x4_mmi (uint8_t* pSample1, int32_t iStride1,
+                              uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = row 1 of pSample1 */
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "punpcklwd  $f0, $f0, $f2                             \n\t"  /* $f0 = rows 0 and 1 of pSample1 */
+
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = row 1 of pSample2 */
+    "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    "punpcklwd  $f6, $f6, $f8                             \n\t"  /* $f6 = rows 0 and 1 of pSample2 */
+    "pasubub    $f0, $f0, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"  /* $f0 = SAD of rows 0-1 */
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"  /* move both pointers to row 2 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = row 3 of pSample1 */
+    "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f4, 0x7($8)                              \n\t"
+    "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f4, 0x0($8)                              \n\t"
+    "punpcklwd  $f2, $f2, $f4                             \n\t"
+
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = row 3 of pSample2 */
+    "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    "punpcklwd  $f6, $f6, $f8                             \n\t"
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f2, $f2                                  \n\t"  /* $f2 = SAD of rows 2-3 */
+    "paddh      $f0, $f0, $f2                             \n\t"
+
+    "dmfc1      %[iSadSum], $f0                           \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8"
+  );
+  return iSadSum;
+}
+/*
+ * WelsSampleSad8x8_mmi: sum of absolute differences (SAD) of an 8x8 block,
+ * Loongson MMI implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride in bytes
+ *   pSample2, iStride2 - second block and its row stride in bytes
+ *   returns            - the accumulated SAD moved out of $f0
+ *
+ * Two code paths exist, selected by the value CACHE_SPLIT_CHECK leaves in $8:
+ *   - $8 <= 0: jump to label 1 and run MMI_GetSad8x4 + MMI_GetSad8x4_End with
+ *     the accumulators $f24/$f26.
+ *   - $8 >  0: pSample2 is rounded down to a multiple of 8 and every row of
+ *     pSample2 is rebuilt from the two 8-byte words at pSample2 and
+ *     pSample2 + 8 ($9): the first word is shifted right by the misalignment
+ *     in bits ($f20), the second left by 64 minus that amount ($f24), and
+ *     the two are OR-ed. Rows are processed in pairs and accumulated into
+ *     $f28/$f30.
+ * NOTE(review): CACHE_SPLIT_CHECK, MMI_GetSad8x4*, BACKUP_REG and RECOVER_REG
+ * are defined outside this chunk; presumably the check reports whether an
+ * 8-byte read of pSample2 would straddle a 32-byte cache line - confirm.
+ */
+int32_t WelsSampleSad8x8_mmi (uint8_t* pSample1, int32_t iStride1,
+                              uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSadSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    CACHE_SPLIT_CHECK($8, 8, 32)
+    "blez       $8, 1f                                    \n\t"  /* $8 <= 0: take the macro-based path at 1: */
+    "nop                                                  \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"  /* clear the accumulators of this path */
+    "xor        $f30, $f30, $f30                          \n\t"
+
+    "move       $9, %[pSample2]                           \n\t"
+    "and        $9, $9, 0x7                               \n\t"  /* $9 = byte offset of pSample2 inside its 8-byte word */
+    PTR_SUBU   "%[pSample2], %[pSample2], $9              \n\t"  /* round pSample2 down to a multiple of 8 */
+    "dli        $8, 0x8                                   \n\t"
+    PTR_SUBU   "$8, $8, $9                                \n\t"  /* $8 = 8 - offset */
+
+    "dsll       $9, $9, 0x3                               \n\t"  /* convert both byte counts to bit counts */
+    "dsll       $8, $8, 0x3                               \n\t"
+    "dmtc1      $9, $f20                                  \n\t"  /* $f20 = right-shift amount for the first word */
+    "dmtc1      $8, $f24                                  \n\t"  /* $f24 = left-shift amount for the second word */
+    "dli        $9, 0x8                                   \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    PTR_ADDU   "$9, $9, %[pSample2]                       \n\t"  /* $9 = address of the following 8-byte word */
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
+
+    "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+    "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
+    "gsldlc1    $f10, 0x7($9)                             \n\t"
+    "dsrl       $f4, $f4, $f20                            \n\t"  /* rebuild two rows of pSample2 from their word pairs */
+    "gsldrc1    $f10, 0x0($9)                             \n\t"
+    "dsrl       $f6, $f6, $f20                            \n\t"
+    "dsll       $f8, $f8, $f24                            \n\t"
+    "dsll       $f10, $f10, $f24                          \n\t"
+    "or         $f4, $f4, $f8                             \n\t"
+    "or         $f6, $f6, $f10                            \n\t"
+
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    "pasubub    $f0, $f0, $f4                             \n\t"  /* rows 0-1: absolute differences, byte sums, accumulate */
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f28, $f28, $f0                           \n\t"
+    "paddh      $f30, $f30, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* rows 2-3 */
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
+
+    "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+    "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f10, 0x7($9)                             \n\t"
+    "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f10, 0x0($9)                             \n\t"
+    "dsrl       $f4, $f4, $f20                            \n\t"
+    "dsrl       $f6, $f6, $f20                            \n\t"
+    "dsll       $f8, $f8, $f24                            \n\t"
+    "dsll       $f10, $f10, $f24                          \n\t"
+    "or         $f4, $f4, $f8                             \n\t"
+    "or         $f6, $f6, $f10                            \n\t"
+
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+
+    "pasubub    $f0, $f0, $f4                             \n\t"
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f28, $f28, $f0                           \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* rows 4-5: loads are interleaved with the previous accumulate */
+    "paddh      $f30, $f30, $f2                           \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
+
+    "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+    "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f10, 0x7($9)                             \n\t"
+    "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f10, 0x0($9)                             \n\t"
+    "dsrl       $f4, $f4, $f20                            \n\t"
+    "dsrl       $f6, $f6, $f20                            \n\t"
+    "dsll       $f8, $f8, $f24                            \n\t"
+    "dsll       $f10, $f10, $f24                          \n\t"
+    "or         $f4, $f4, $f8                             \n\t"
+    "or         $f6, $f6, $f10                            \n\t"
+
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+
+    "pasubub    $f0, $f0, $f4                             \n\t"
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f28, $f28, $f0                           \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* rows 6-7 */
+    "paddh      $f30, $f30, $f2                           \n\t"
+
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    "gsldlc1    $f2, 0x7(%[pSample1])                     \n\t"
+
+    "gsldlc1    $f4, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f8, 0x7($9)                              \n\t"
+    "gsldrc1    $f2, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f4, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f8, 0x0($9)                              \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    PTR_ADDU   "$9, $9, %[iStride2]                       \n\t"
+    "gsldlc1    $f6, 0x7(%[pSample2])                     \n\t"
+    "gsldlc1    $f10, 0x7($9)                             \n\t"
+    "gsldrc1    $f6, 0x0(%[pSample2])                     \n\t"
+    "gsldrc1    $f10, 0x0($9)                             \n\t"
+    "dsrl       $f4, $f4, $f20                            \n\t"
+    "dsrl       $f6, $f6, $f20                            \n\t"
+    "dsll       $f8, $f8, $f24                            \n\t"
+    "dsll       $f10, $f10, $f24                          \n\t"
+    "or         $f4, $f4, $f8                             \n\t"
+    "or         $f6, $f6, $f10                            \n\t"
+
+    "pasubub    $f0, $f0, $f4                             \n\t"
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f28, $f28, $f0                           \n\t"
+    "paddh      $f30, $f30, $f2                           \n\t"
+
+    "mov.d      $f0, $f30                                 \n\t"  /* fold the accumulators and leave through 2: */
+    "paddh      $f0, $f0, $f28                            \n\t"
+    "dmfc1      %[iSadSum], $f0                           \n\t"
+    "j          2f                                        \n\t"
+    "nop                                                  \n\t"
+
+    "1:                                                   \n\t"  /* macro-based path, accumulators $f24/$f26 */
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    MMI_GetSad8x4
+    MMI_GetSad8x4_End
+    "paddh      $f0, $f26, $f24                           \n\t"
+    "dmfc1      %[iSadSum], $f0                           \n\t"
+    "2:                                                   \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSadSum;
+}
+/*
+ * WelsSampleSatd4x4_mmi: SATD-style cost of a 4x4 block, Loongson MMI
+ * implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride in bytes
+ *   pSample2, iStride2 - second block and its row stride in bytes
+ *   returns            - (low 16 bits of the horizontally summed absolute
+ *                        values) >> 1
+ *
+ * Steps visible in the code:
+ *   1. load four rows of each block; rows 0/2 and rows 1/3 are packed
+ *      pairwise into one 64-bit register with punpcklwd;
+ *   2. widen the bytes to halfwords against a zero register and subtract;
+ *   3. run several add/subtract stages separated by MMI_XSawp_* interleaves;
+ *   4. take absolute values (WELS_AbsH), add them with saturation into
+ *      $f24/$f26 and reduce with MMI_SumWHorizon1.
+ * NOTE(review): step 3 looks like a 4x4 Hadamard transform of the difference
+ * block; the MMI_XSawp_*, WELS_AbsH, MMI_SumWHorizon1 and
+ * BACKUP_REG/RECOVER_REG macros are defined outside this chunk - confirm.
+ */
+int32_t WelsSampleSatd4x4_mmi (uint8_t* pSample1, int32_t iStride1,
+                               uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSatdSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* rows 0/1 of pSample1 -> $f0/$f4 */
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f4, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f4, 0x0($8)                              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* rows 2/3 of pSample1 -> $f8/$f12 */
+    "gsldlc1    $f8, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f12, 0x7($8)                             \n\t"
+    "gsldrc1    $f8, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f12, 0x0($8)                             \n\t"
+    "punpcklwd  $f0, $f0, $f8                             \n\t"  /* $f0 = rows 0 and 2 */
+    "punpcklwd  $f4, $f4, $f12                            \n\t"  /* $f4 = rows 1 and 3 */
+
+    PTR_ADDU   "$8, %[pSample2], %[iStride2]              \n\t"  /* same layout for pSample2 in $f16/$f20 */
+    "gsldlc1    $f16, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f20, 0x7($8)                             \n\t"
+    "gsldrc1    $f16, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f20, 0x0($8)                             \n\t"
+    PTR_ADDU   "%[pSample2], $8, %[iStride2]              \n\t"
+    PTR_ADDU   "$8, %[pSample2], %[iStride2]              \n\t"
+    "gsldlc1    $f24, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f28, 0x7($8)                             \n\t"
+    "gsldrc1    $f24, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f28, 0x0($8)                             \n\t"
+    "punpcklwd  $f16, $f16, $f24                          \n\t"
+    "punpcklwd  $f20, $f20, $f28                          \n\t"
+
+    "xor        $f24, $f24, $f24                          \n\t"  /* $f24/$f26 = 0: zero for unpacking, later the accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "punpckhbh  $f2, $f0, $f24                            \n\t"  /* widen bytes to halfwords */
+    "punpcklbh  $f0, $f0, $f24                            \n\t"
+    "punpckhbh  $f6, $f4, $f24                            \n\t"
+    "punpcklbh  $f4, $f4, $f24                            \n\t"
+    "punpckhbh  $f18, $f16, $f24                          \n\t"
+    "punpcklbh  $f16, $f16, $f24                          \n\t"
+    "punpckhbh  $f22, $f20, $f24                          \n\t"
+    "punpcklbh  $f20, $f20, $f24                          \n\t"
+
+    "psubh      $f0, $f0, $f16                            \n\t"  /* difference pSample1 - pSample2 per halfword */
+    "psubh      $f2, $f2, $f18                            \n\t"
+    "psubh      $f4, $f4, $f20                            \n\t"
+    "psubh      $f6, $f6, $f22                            \n\t"
+
+    "mov.d      $f8, $f0                                  \n\t"  /* first add/subtract stage */
+    "mov.d      $f10, $f2                                 \n\t"
+    "paddh      $f0, $f0, $f4                             \n\t"
+    "paddh      $f2, $f2, $f6                             \n\t"
+    "psubh      $f8, $f8, $f4                             \n\t"
+    "psubh      $f10, $f10, $f6                           \n\t"
+    MMI_XSawp_DQ($f0, $f2, $f8, $f10, $f12, $f14)
+
+    "mov.d      $f16, $f0                                 \n\t"  /* second add/subtract stage */
+    "mov.d      $f18, $f2                                 \n\t"
+    "paddh      $f0, $f0, $f12                            \n\t"
+    "paddh      $f2, $f2, $f14                            \n\t"
+    "psubh      $f16, $f16, $f12                          \n\t"
+    "psubh      $f18, $f18, $f14                          \n\t"
+
+    "mov.d      $f8, $f2                                  \n\t"  /* halfword interleave of the stage results */
+    "punpckhhw  $f2, $f0, $f16                            \n\t"
+    "punpcklhw  $f0, $f0, $f16                            \n\t"
+    "punpcklhw  $f16, $f18, $f8                           \n\t"
+    "punpckhhw  $f18, $f18, $f8                           \n\t"
+
+    MMI_XSawp_WD($f0, $f2, $f16, $f18, $f12, $f14)
+    MMI_XSawp_DQ($f0, $f2, $f12, $f14, $f20, $f22)
+
+    "mov.d      $f28, $f0                                 \n\t"  /* third add/subtract stage */
+    "mov.d      $f30, $f2                                 \n\t"
+    "paddh      $f0, $f0, $f20                            \n\t"
+    "paddh      $f2, $f2, $f22                            \n\t"
+    "psubh      $f28, $f28, $f20                          \n\t"
+    "psubh      $f30, $f30, $f22                          \n\t"
+
+    MMI_XSawp_DQ($f0, $f2, $f28, $f30, $f4, $f6)
+
+    "psubh      $f8, $f0, $f4                             \n\t"  /* fourth add/subtract stage */
+    "psubh      $f10, $f2, $f6                            \n\t"
+    "paddh      $f0, $f0, $f4                             \n\t"
+    "paddh      $f2, $f2, $f6                             \n\t"
+
+    WELS_AbsH($f0, $f2, $f0, $f2, $f12, $f14)
+    "paddush    $f24, $f24, $f0                           \n\t"  /* saturating accumulate of the absolute values */
+    "paddush    $f26, $f26, $f2                           \n\t"
+    WELS_AbsH($f8, $f10, $f8, $f10, $f16, $f18)
+    "paddush    $f24, $f24, $f8                           \n\t"
+    "paddush    $f26, $f26, $f10                          \n\t"
+    MMI_SumWHorizon1($f24, $f26, $f16, $f18, $f28, $f30, $8)
+
+    "dmfc1      $8, $f24                                  \n\t"
+    "dli        $9, 0xffff                                \n\t"
+    "and        $8, $8, $9                                \n\t"  /* keep only the low 16 bits of the reduced sum */
+    "dsrl       %[iSatdSum], $8, 0x1                      \n\t"  /* result = sum >> 1 */
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1),  [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSatdSum;
+}
+/*
+ * WelsSampleSatd8x8_mmi: SATD-style cost of an 8x8 block, Loongson MMI
+ * implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride
+ *   pSample2, iStride2 - second block and its row stride
+ *   returns            - low 32 bits of $f24 after the final reduction
+ *
+ * $f24/$f26 (accumulators) and $f28 are cleared and $f30 is loaded with 1.
+ * MMI_GetSatd8x8_End does the per-block work; afterwards every halfword of
+ * the accumulators is shifted right by one ($f30), $f30 is reloaded with
+ * 0x4e and MMI_SumWHorizon reduces the accumulators.
+ * NOTE(review): MMI_SumWHorizon and BACKUP_REG/RECOVER_REG are defined
+ * outside this chunk; presumably 0x4e is a halfword-shuffle control used by
+ * the horizontal sum - confirm.
+ */
+int32_t WelsSampleSatd8x8_mmi (uint8_t* pSample1, int32_t iStride1,
+                               uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSatdSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* clear the accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "dli        $8, 0x1                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 1, shift count used after the macro */
+    MMI_GetSatd8x8_End
+    "psrlh      $f24, $f24, $f30                          \n\t"  /* halve every halfword of the accumulators */
+    "dli        $8, 0x4e                                  \n\t"
+    "psrlh      $f26, $f26, $f30                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 0x4e, consumed by MMI_SumWHorizon */
+    MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+    "mfc1       %[iSatdSum], $f24                         \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSatdSum;
+}
+/*
+ * WelsSampleSatd8x16_mmi: SATD-style cost of an 8x16 block, Loongson MMI
+ * implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride
+ *   pSample2, iStride2 - second block and its row stride
+ *   returns            - low 32 bits of $f24 after the final reduction
+ *
+ * Same frame as WelsSampleSatd8x8_mmi, except that MMI_GetSatd8x8 is expanded
+ * once before MMI_GetSatd8x8_End.
+ * NOTE(review): MMI_GetSatd8x8, MMI_SumWHorizon and BACKUP_REG/RECOVER_REG
+ * are defined outside this chunk; presumably MMI_GetSatd8x8 handles the upper
+ * 8x8 half and leaves the pointers at the lower half - confirm.
+ */
+int32_t WelsSampleSatd8x16_mmi (uint8_t* pSample1, int32_t iStride1,
+                                uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSatdSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* clear the accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "dli        $8, 0x1                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 1, shift count used after the macros */
+    MMI_GetSatd8x8
+    MMI_GetSatd8x8_End
+    "psrlh      $f24, $f24, $f30                          \n\t"  /* halve every halfword of the accumulators */
+    "dli        $8, 0x4e                                  \n\t"
+    "psrlh      $f26, $f26, $f30                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 0x4e, consumed by MMI_SumWHorizon */
+    MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+    "mfc1       %[iSatdSum], $f24                         \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+      "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSatdSum;
+}
+/*
+ * WelsSampleSatd16x8_mmi: SATD-style cost of a 16x8 block, Loongson MMI
+ * implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride
+ *   pSample2, iStride2 - second block and its row stride
+ *   returns            - low 32 bits of $f24 after the final reduction
+ *
+ * The incoming pointers are copied to $9/$10 before MMI_GetSatd8x8_Offset8 is
+ * expanded, followed by MMI_GetSatd8x8_End; the reduction matches the other
+ * SATD functions (halve each halfword, then MMI_SumWHorizon with 0x4e).
+ * NOTE(review): MMI_GetSatd8x8_Offset8, MMI_SumWHorizon and
+ * BACKUP_REG/RECOVER_REG are defined outside this chunk; presumably the
+ * Offset8 variant handles the left 8x8 and then restarts from the saved
+ * pointers ($9/$10) plus 8 bytes for the right 8x8 - confirm.
+ */
+int32_t WelsSampleSatd16x8_mmi (uint8_t* pSample1, int32_t iStride1,
+                                uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSatdSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* clear the accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "dli        $8, 0x1                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 1, shift count used after the macros */
+    "move       $9, %[pSample1]                           \n\t"  /* keep the block origins in $9/$10 */
+    "move       $10, %[pSample2]                          \n\t"
+    MMI_GetSatd8x8_Offset8
+
+    MMI_GetSatd8x8_End
+    "psrlh      $f24, $f24, $f30                          \n\t"  /* halve every halfword of the accumulators */
+    "dli        $8, 0x4e                                  \n\t"
+    "psrlh      $f26, $f26, $f30                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 0x4e, consumed by MMI_SumWHorizon */
+    MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+    "mfc1       %[iSatdSum], $f24                         \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
+      "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+      "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSatdSum;
+}
+/*
+ * WelsSampleSatd16x16_mmi: SATD-style cost of a 16x16 block, Loongson MMI
+ * implementation.
+ *
+ *   pSample1, iStride1 - first block and its row stride
+ *   pSample2, iStride2 - second block and its row stride
+ *   returns            - low 32 bits of $f24 after the final reduction
+ *
+ * The incoming pointers are copied to $9/$10, then four 8x8 steps are
+ * expanded in the order MMI_GetSatd8x8, MMI_GetSatd8x8_Offset8,
+ * MMI_GetSatd8x8, MMI_GetSatd8x8_End. Unlike the smaller SATD functions the
+ * 0x4e constant is placed in $f0, so $f30 keeps the shift count 1 while both
+ * psrlh instructions run.
+ * NOTE(review): the MMI_GetSatd8x8* macros, MMI_SumWHorizon and
+ * BACKUP_REG/RECOVER_REG are defined outside this chunk; presumably the
+ * sequence walks the left column of 8x8 blocks, then uses $9/$10 to restart
+ * at the right column - confirm.
+ */
+int32_t WelsSampleSatd16x16_mmi (uint8_t* pSample1, int32_t iStride1,
+                                 uint8_t* pSample2, int32_t iStride2) {
+  int32_t iSatdSum = 0;
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* clear the accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "dli        $8, 0x1                                   \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "dmtc1      $8, $f30                                  \n\t"  /* $f30 = 1, shift count used after the macros */
+    "move       $9, %[pSample1]                           \n\t"  /* keep the block origins in $9/$10 */
+    "move       $10, %[pSample2]                          \n\t"
+
+    MMI_GetSatd8x8
+    MMI_GetSatd8x8_Offset8
+
+    MMI_GetSatd8x8
+    MMI_GetSatd8x8_End
+
+    "dli        $8, 0x4e                                  \n\t"
+    "psrlh      $f24, $f24, $f30                          \n\t"  /* halve every halfword of the accumulators */
+    "dmtc1      $8, $f0                                   \n\t"  /* $f0 = 0x4e, consumed by MMI_SumWHorizon */
+    "psrlh      $f26, $f26, $f30                          \n\t"
+    MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f0)
+    "mfc1       %[iSatdSum], $f24                         \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+    : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
+      "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+      "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+  return iSatdSum;
+}
+
+void WelsSampleSadFour16x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+                                 int32_t iStride2, int32_t* pSad) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    PTR_SUBU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f4                           \n\t"
+    "pasubub    $f14, $f14, $f6                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f8, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f10, 0xE(%[pSample2])                    \n\t"
+    "gsldrc1    $f8, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f10, 0x7(%[pSample2])                    \n\t"
+    "pasubub    $f8, $f8, $f0                             \n\t"
+    "pasubub    $f10, $f10, $f2                           \n\t"
+    "biadd      $f8, $f8                                  \n\t"
+    "biadd      $f10, $f10                                \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0x10(%[pSample2])                   \n\t"
+    "gsldrc1    $f14, 0x9(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+    "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+    "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+
+    "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+    "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+    "gslqc1     $f2, $f0, 0x0($8)                         \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+
+    "gslqc1     $f10, $f8, 0x0($8)                        \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    "pasubub    $f8, $f8, $f12                            \n\t"
+    "pasubub    $f10, $f10, $f14                          \n\t"
+    "biadd      $f8, $f8                                  \n\t"
+    "biadd      $f10, $f10                                \n\t"
+    "paddh      $f20, $f20, $f8                           \n\t"
+    "paddh      $f22, $f22, $f10                          \n\t"
+
+    "gsldlc1    $f8, 0x6($9)                              \n\t"
+    "gsldlc1    $f10, 0xE($9)                             \n\t"
+    "gsldrc1    $f8, -0x1($9)                             \n\t"
+    "gsldrc1    $f10, 0x7($9)                             \n\t"
+    "pasubub    $f8, $f8, $f0                             \n\t"
+    "pasubub    $f10, $f10, $f2                           \n\t"
+    "biadd      $f8, $f8                                  \n\t"
+    "biadd      $f10, $f10                                \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+
+    "gsldlc1    $f12, 0x8($9)                             \n\t"
+    "gsldlc1    $f14, 0x10($9)                            \n\t"
+    "gsldrc1    $f12, 0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x9($9)                             \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "paddh      $f16, $f16, $f18                          \n\t"
+    "paddh      $f20, $f20, $f22                          \n\t"
+    "paddh      $f24, $f24, $f26                          \n\t"
+    "paddh      $f28, $f28, $f30                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "punpcklwd  $f24, $f24, $f28                          \n\t"
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void WelsSampleSadFour16x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+                                int32_t iStride2, int32_t* pSad) { /*
+   * Loongson MMI routine: for the 16-byte-wide, 8-row block at pSample1
+   * (row pitch iStride1) it accumulates four sums of absolute differences
+   * against pSample2 (row pitch iStride2) displaced by one row up, one row
+   * down, one byte left and one byte right, then writes the four totals to
+   * pSad with a single quadword store.
+   *
+   * Accumulator pairs (one register per 8-byte half of a 16-byte row):
+   *   $f16/$f18  pSample2 - iStride2      $f20/$f22  pSample2 + iStride2
+   *   $f24/$f26  pSample2 - 1             $f28/$f30  pSample2 + 1
+   * The closing punpcklwd/gssqc1 sequence packs them in that order, so the
+   * totals presumably land in pSad[0..3] as up, down, left, right -- confirm
+   * against the gssqc1 register/offset convention.
+   *
+   * pSample2 is always read with gsldlc1/gsldrc1 pairs; pSample1 is read with
+   * gslqc1 and pSad is written with gssqc1.
+   * NOTE(review): the quadword forms presumably require 16-byte-aligned
+   * pSample1 rows and pSad -- confirm against the callers.
+   *
+   * MMI_Get4LW16Sad, BACKUP_REG and RECOVER_REG are defined outside this
+   * function. Judging only from how they are used here, MMI_Get4LW16Sad
+   * presumably takes three consecutive pSample1 rows (previous, current,
+   * next), the pSample2 row already held in $f12/$f14 and a pointer to that
+   * pSample2 row, and updates all four accumulator pairs; BACKUP_REG and
+   * RECOVER_REG presumably save and restore floating-point registers used
+   * below -- TODO confirm both. Row numbers in the comments below count from
+   * the incoming pSample1/pSample2 and assume the macro leaves the pointer
+   * registers unchanged.
+   */
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f16, $f16, $f16                          \n\t" /* clear the accumulators (this and the xor lines below) */
+    "xor        $f18, $f18, $f18                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t" /* pSample1 row 0 -> $f0/$f2 */
+    PTR_SUBU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" /* pSample2 -> row -1 */
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t" /* pSample2 row -1 -> $f12/$f14 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t" /* pSample1 -> row 1 */
+    "pasubub    $f12, $f12, $f0                           \n\t" /* pSample1 row 0 vs. pSample2 row -1 ... */
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t" /* pSample2 -> row 0 again */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t" /* ... added to $f16/$f18 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+    /* pSample1 row 1 ($f4/$f6) vs. pSample2 row 0 -> $f16/$f18 */
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f4                           \n\t"
+    "pasubub    $f14, $f14, $f6                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+    /* pSample1 row 0 ($f0/$f2) vs. pSample2 row 0 read one byte to the left -> $f24/$f26 */
+    "gsldlc1    $f8, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f10, 0xE(%[pSample2])                    \n\t"
+    "gsldrc1    $f8, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f10, 0x7(%[pSample2])                    \n\t"
+    "pasubub    $f8, $f8, $f0                             \n\t"
+    "pasubub    $f10, $f10, $f2                           \n\t"
+    "biadd      $f8, $f8                                  \n\t"
+    "biadd      $f10, $f10                                \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"
+    "paddh      $f26, $f26, $f10                          \n\t"
+    /* pSample1 row 0 ($f0/$f2) vs. pSample2 row 0 read one byte to the right -> $f28/$f30 */
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0x10(%[pSample2])                   \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x9(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 -> pSample1 row 2 */
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 -> pSample2 row 1 */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+    /* Six macro steps follow: $8/$9 and pSample1/pSample2 alternate as row pointers while $f0/$f2, $f4/$f6, $f8/$f10 rotate through the pSample1 rows. */
+    "gslqc1     $f10, $f8, 0x0($8)                        \n\t" /* pSample1 row 2 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t" /* pSample2 row 1 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 -> row 3 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 2 */
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9) /* pSample1 rows 0,1,2 with pSample2 row 1 */
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t" /* pSample1 row 3 */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t" /* pSample2 row 2 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 -> pSample1 row 4 */
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 -> pSample2 row 3 */
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2]) /* pSample1 rows 1,2,3 with pSample2 row 2 */
+    "gslqc1     $f6, $f4, 0x0($8)                         \n\t" /* pSample1 row 4 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t" /* pSample2 row 3 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 -> row 5 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 4 */
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9) /* pSample1 rows 2,3,4 with pSample2 row 3 */
+    "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t" /* pSample1 row 5 */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t" /* pSample2 row 4 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 -> pSample1 row 6 */
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 -> pSample2 row 5 */
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2]) /* pSample1 rows 3,4,5 with pSample2 row 4 */
+    "gslqc1     $f2, $f0, 0x0($8)                         \n\t" /* pSample1 row 6 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t" /* pSample2 row 5 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 -> row 7 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 6 */
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9) /* pSample1 rows 4,5,6 with pSample2 row 5 */
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t" /* pSample1 row 7 (last row) */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t" /* pSample2 row 6 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 -> pSample2 row 7 */
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2]) /* pSample1 rows 5,6,7 with pSample2 row 6 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t" /* pSample2 row 7 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t" /* pSample1 row 6 vs. pSample2 row 7 -> $f20/$f22 */
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+    /* pSample1 row 7 ($f4/$f6) vs. pSample2 row 7 read one byte to the left -> $f24/$f26 */
+    "gsldlc1    $f0, 0x6($9)                              \n\t"
+    "gsldlc1    $f2, 0xE($9)                              \n\t"
+    "gsldrc1    $f0, -0x1($9)                             \n\t"
+    "gsldrc1    $f2, 0x7($9)                              \n\t"
+    "pasubub    $f0, $f0, $f4                             \n\t"
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+    /* pSample1 row 7 ($f4/$f6) vs. pSample2 row 7 read one byte to the right -> $f28/$f30 */
+    "gsldlc1    $f12, 0x8($9)                             \n\t"
+    "gsldlc1    $f14, 0x10($9)                            \n\t"
+    "gsldrc1    $f12, 0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x9($9)                             \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 -> row 8, one row past the block */
+    "pasubub    $f12, $f12, $f4                           \n\t"
+    "pasubub    $f14, $f14, $f6                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+    /* pSample1 row 7 ($f4/$f6) vs. pSample2 row 8 -> $f20/$f22 */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    "pasubub    $f4, $f4, $f12                            \n\t"
+    "pasubub    $f6, $f6, $f14                            \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f20, $f20, $f4                           \n\t"
+    "paddh      $f22, $f22, $f6                           \n\t"
+    /* Fold the two halves of each accumulator pair, pack the four totals and store them to pSad. */
+    "paddh      $f16, $f16, $f18                          \n\t"
+    "paddh      $f20, $f20, $f22                          \n\t"
+    "paddh      $f24, $f24, $f26                          \n\t"
+    "paddh      $f28, $f28, $f30                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t" /* $f16 <- low words of $f16 and $f20 */
+    "punpcklwd  $f24, $f24, $f28                          \n\t" /* $f24 <- low words of $f24 and $f28 */
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t" /* one quadword store of $f16 and $f24 */
+    : [pSample1]"+&r"((unsigned char *)pSample1), /* both pointers are advanced inside the asm, hence read-write operands */
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* $8/$9 are the scratch row pointers used above */
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
+  );
+  RECOVER_REG;
+}
+
+void WelsSampleSadFour8x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+                                int32_t iStride2, int32_t* pSad) {
+  BACKUP_REG;
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"
+    "xor        $f18, $f18, $f18                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"
+    "xor        $f22, $f22, $f22                          \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"
+    "xor        $f30, $f30, $f30                          \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_SUBU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "paddh      $f16, $f16, $f18                          \n\t"
+    "paddh      $f20, $f20, $f22                          \n\t"
+    "paddh      $f24, $f24, $f26                          \n\t"
+    "paddh      $f28, $f28, $f30                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"
+    "punpcklwd  $f24, $f24, $f28                          \n\t"
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"
+    : [pSample1]"+&r"((unsigned char *)pSample1),
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;
+}
+
+void WelsSampleSadFour8x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, /* four 8x8 SADs: the pSample1 block against pSample2 displaced by one pixel */
+                               int32_t iStride2, int32_t* pSad) { /* pSad[0..3] = up (-iStride2), down (+iStride2), left (-1), right (+1); see the final store */
+  BACKUP_REG; /* macro defined outside this view; presumably saves FP registers around the asm -- confirm */
+  __asm__ volatile ( /* fully unrolled: 4 passes, each handling two 8-byte rows */
+    ".set       arch=loongson3a                           \n\t" /* let the assembler accept the Loongson-3A opcodes used below */
+    "xor        $f16, $f16, $f16                          \n\t" /* $f16/$f18 = 0: "up" accumulators (first/second row of each pass) */
+    "xor        $f18, $f18, $f18                          \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t" /* with the gsldrc1 at offset 0x0 below: $f0 = 8 bytes of source row 0 */
+    "xor        $f20, $f20, $f20                          \n\t" /* $f20/$f22 = 0: "down" accumulators */
+    "xor        $f22, $f22, $f22                          \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "xor        $f24, $f24, $f24                          \n\t" /* $f24/$f26 = 0: "left" accumulators */
+    "xor        $f26, $f26, $f26                          \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 = source row 1 */
+    PTR_SUBU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 = reference row -1 (one line above the block) */
+    "xor        $f28, $f28, $f28                          \n\t" /* $f28/$f30 = 0: "right" accumulators */
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t" /* $f2 = source row 1 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t" /* $f12 = reference row -1 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 = $9 + iStride2, i.e. reference row 0 again */
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t" /* $f14 = reference row 0 */
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t" /* per-byte absolute difference: source row vs. the reference row above it */
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t" /* horizontal sum of the 8 byte differences */
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t" /* accumulate "up" SAD */
+    "paddh      $f18, $f18, $f14                          \n\t"
+    /* pass 1, left/right: reference rows re-read at byte offsets -1 ($f4/$f6) and +1 ($f12/$f14) */
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t" /* $9 = next reference row */
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t" /* pSample1 has now advanced two source rows */
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t" /* accumulate "left" SAD */
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t" /* accumulate "right" SAD */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t" /* pSample2 has now advanced two reference rows */
+    "paddh      $f30, $f30, $f14                          \n\t"
+    /* pass 1, down: the same two source rows against the reference rows one line below them */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t" /* result goes to $f0/$f2 so $f12/$f14 keep the reference rows for the next pass */
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t" /* $8 = second source row of the next pass */
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t" /* accumulate "down" SAD */
+    "paddh      $f22, $f22, $f2                           \n\t"
+    /* pass 2, up: fresh source rows; $f12/$f14 still hold the reference rows directly above them */
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+    /* pass 2, left/right */
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+    /* pass 2, down */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+    /* pass 3, up */
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+    /* pass 3, left/right */
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+    /* pass 3, down */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+    /* pass 4, up */
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+    /* pass 4, left/right */
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+    /* pass 4, down: last rows, so $8 is not advanced again */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+    /* reduction: fold each accumulator pair, pack the four totals as 32-bit words, store all 16 bytes */
+    "paddh      $f16, $f16, $f18                          \n\t" /* up total */
+    "paddh      $f20, $f20, $f22                          \n\t" /* down total */
+    "paddh      $f24, $f24, $f26                          \n\t" /* left total */
+    "paddh      $f28, $f28, $f30                          \n\t" /* right total */
+    "punpcklwd  $f16, $f16, $f20                          \n\t" /* $f16 = { up, down } as two 32-bit words */
+    "punpcklwd  $f24, $f24, $f28                          \n\t" /* $f24 = { left, right } as two 32-bit words */
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t" /* 128-bit store of $f16 then $f24; NOTE(review): presumably needs pSad 16-byte aligned -- confirm */
+    : [pSample1]"+&r"((unsigned char *)pSample1), /* both pointers are advanced by the asm, hence read-write operands */
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* NOTE(review): $f8/$f10 are listed as clobbered but never referenced in this asm body */
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
+  );
+  RECOVER_REG; /* counterpart of BACKUP_REG above; macro defined outside this view */
+}
--- a/codec/common/src/expand_pic.cpp
+++ b/codec/common/src/expand_pic.cpp
@@ -140,6 +140,13 @@
     pExpandPicFunc->pfExpandChromaPicture[1] = ExpandPictureChroma_AArch64_neon;
   }
 #endif//HAVE_NEON_AARCH64
+#if defined(HAVE_MMI)
+  if (kuiCPUFlag & WELS_CPU_MMI) {
+    pExpandPicFunc->pfExpandLumaPicture      = ExpandPictureLuma_mmi;
+    pExpandPicFunc->pfExpandChromaPicture[0] = ExpandPictureChromaUnalign_mmi;
+    pExpandPicFunc->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_mmi;
+  }
+#endif//HAVE_MMI
 }
 
 
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -153,6 +153,16 @@
 void WelsIChromaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 void WelsIChromaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
 #endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI) /* prototypes below are only visible in builds that define HAVE_MMI */
+void WelsI16x16LumaPredDc_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* installed as pfGetLumaI16x16Pred[I16_PRED_DC] in get_intra_predictor.cpp */
+void WelsI16x16LumaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* installed as pfGetLumaI16x16Pred[I16_PRED_P] */
+
+void WelsIChromaPredH_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* installed as pfGetChromaPred[C_PRED_H] */
+void WelsIChromaPredV_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* installed as pfGetChromaPred[C_PRED_V] */
+void WelsIChromaPredDc_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* installed as pfGetChromaPred[C_PRED_DC] */
+void WelsIChromaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride); /* installed as pfGetChromaPred[C_PRED_P] */
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -124,6 +124,14 @@
 int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
                                             int32_t);
 #endif
+
+#if defined (HAVE_MMI) /* MMI SATD prototypes, only visible in builds that define HAVE_MMI */
+int32_t WelsSampleSatd8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t); /* parameters are unnamed here; presumably (sample1, stride1, sample2, stride2) -- confirm against the definition */
+int32_t WelsSampleSatd16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+#endif//HAVE_MMI
 #if defined(__cplusplus)
 }
 #endif//__cplusplus
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -778,6 +778,11 @@
     *pfSetNZCZero = WelsNonZeroCount_sse2;
   }
 #endif
+#if defined(HAVE_MMI)
+  if (iCpu & WELS_CPU_MMI) {
+    *pfSetNZCZero = WelsNonZeroCount_mmi;
+  }
+#endif
 }
 
 void  DeblockingInit (DeblockingFunc*   pFunc,  int32_t iCpu) {
@@ -842,6 +847,19 @@
 #endif
   }
 #endif
+
+#if defined(HAVE_MMI)
+  if (iCpu & WELS_CPU_MMI) {
+    pFunc->pfLumaDeblockingLT4Ver   = DeblockLumaLt4V_mmi;
+    pFunc->pfLumaDeblockingEQ4Ver   = DeblockLumaEq4V_mmi;
+    pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_mmi;
+    pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_mmi;
+    pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_mmi;
+    pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_mmi;
+    pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_mmi;
+    pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_mmi;
+  }
+#endif//HAVE_MMI
 }
 
 
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -586,5 +586,16 @@
     pFuncList->pfDctFourT4              = WelsDctFourT4_AArch64_neon;
   }
 #endif
+
+#if defined(HAVE_MMI)
+  if (uiCpuFlag & WELS_CPU_MMI) { // use the MMI block-copy routines when the CPU flags report MMI
+    pFuncList->pfCopy8x8Aligned         = WelsCopy8x8_mmi; // 8-wide copies go into the Aligned slots
+    pFuncList->pfCopy8x16Aligned        = WelsCopy8x16_mmi;
+
+    pFuncList->pfCopy16x16Aligned       = WelsCopy16x16_mmi; // 16-wide: one Aligned slot, two NotAligned slots
+    pFuncList->pfCopy16x16NotAligned    = WelsCopy16x16NotAligned_mmi;
+    pFuncList->pfCopy16x8NotAligned     = WelsCopy16x8NotAligned_mmi;
+  }
+#endif//HAVE_MMI
 }
 }
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -720,5 +720,19 @@
     pFuncList->pfGetChromaPred[C_PRED_P]    = WelsIChromaPredPlane_sse2;
   }
 #endif
+
+#if defined(HAVE_MMI)
+  if (kuiCpuFlag & WELS_CPU_MMI) { // select the MMI intra predictors when the CPU flags report MMI
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_mmi; // luma 16x16 table: V, H, DC, P entries
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_mmi;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_mmi;
+    pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_mmi;
+
+    pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmi; // chroma table: H, DC, V, P entries
+    pFuncList->pfGetChromaPred[C_PRED_DC]   = WelsIChromaPredDc_mmi;
+    pFuncList->pfGetChromaPred[C_PRED_V]    = WelsIChromaPredV_mmi;
+    pFuncList->pfGetChromaPred[C_PRED_P]    = WelsIChromaPredPlane_mmi;
+  }
+#endif//HAVE_MMI
 }
 }
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -469,6 +469,27 @@
     pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = WelsIntra16x16Combined3Sad_AArch64_neon;
   }
 #endif
+
+#if defined (HAVE_MMI)
+  if (uiCpuFlag & WELS_CPU_MMI) { // switch the SAD/SATD tables to MMI when the CPU flags report MMI
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_mmi; // SAD: five block sizes
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_mmi;
+
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_mmi; // SATD: same five block sizes
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_mmi;
+
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_mmi; // SadFour: no BLOCK_4x4 entry is set here
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_mmi;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_mmi;
+    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_mmi;
+  }
+#endif//HAVE_MMI
 }
 
 } // namespace WelsEnc
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -271,6 +271,11 @@
 GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
 GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
 #endif
+#ifdef HAVE_MMI // unit tests for the MMI copy routines, built only when HAVE_MMI is defined
+GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8NotAligned_mmi);
+GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_mmi);
+GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_mmi); // NOTE(review): this hunk adds no UT for WelsCopy8x8_mmi / WelsCopy8x16_mmi
+#endif//HAVE_MMI
 
 namespace {
 
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -672,6 +672,20 @@
 GENERATE_Sad16x16_UT (WelsSampleSatd16x16_AArch64_neon, WelsSampleSatd16x16_c, WELS_CPU_NEON)
 #endif
 
+#ifdef HAVE_MMI // each line pairs an MMI routine with its _c counterpart and the WELS_CPU_MMI flag
+GENERATE_Sad4x4_UT (WelsSampleSad4x4_mmi, WelsSampleSad4x4_c, WELS_CPU_MMI) // SAD routines
+GENERATE_Sad8x8_UT (WelsSampleSad8x8_mmi, WelsSampleSad8x8_c, WELS_CPU_MMI)
+GENERATE_Sad8x16_UT (WelsSampleSad8x16_mmi, WelsSampleSad8x16_c, WELS_CPU_MMI)
+GENERATE_Sad16x8_UT (WelsSampleSad16x8_mmi, WelsSampleSad16x8_c, WELS_CPU_MMI)
+GENERATE_Sad16x16_UT (WelsSampleSad16x16_mmi, WelsSampleSad16x16_c, WELS_CPU_MMI)
+
+GENERATE_Sad4x4_UT (WelsSampleSatd4x4_mmi, WelsSampleSatd4x4_c, WELS_CPU_MMI) // SATD routines reuse the GENERATE_Sad*_UT macros
+GENERATE_Sad8x8_UT (WelsSampleSatd8x8_mmi, WelsSampleSatd8x8_c, WELS_CPU_MMI)
+GENERATE_Sad8x16_UT (WelsSampleSatd8x16_mmi, WelsSampleSatd8x16_c, WELS_CPU_MMI)
+GENERATE_Sad16x8_UT (WelsSampleSatd16x8_mmi, WelsSampleSatd16x8_c, WELS_CPU_MMI)
+GENERATE_Sad16x16_UT (WelsSampleSatd16x16_mmi, WelsSampleSatd16x16_c, WELS_CPU_MMI)
+#endif//HAVE_MMI
+
 #define GENERATE_SadFour_UT(func, CPUFLAGS, width, height) \
 TEST_F (SadSatdAssemblyFuncTest, func) { \
   if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
@@ -719,4 +733,11 @@
 GENERATE_SadFour_UT (WelsSampleSadFour8x16_AArch64_neon, WELS_CPU_NEON, 8, 16)
 GENERATE_SadFour_UT (WelsSampleSadFour16x8_AArch64_neon, WELS_CPU_NEON, 16, 8)
 GENERATE_SadFour_UT (WelsSampleSadFour16x16_AArch64_neon, WELS_CPU_NEON, 16, 16)
+#endif
+
+#ifdef HAVE_MMI // SadFour unit tests for the MMI routines; macro arguments are (func, CPUFLAGS, width, height)
+GENERATE_SadFour_UT (WelsSampleSadFour8x8_mmi, WELS_CPU_MMI, 8, 8)
+GENERATE_SadFour_UT (WelsSampleSadFour8x16_mmi, WELS_CPU_MMI, 8, 16)
+GENERATE_SadFour_UT (WelsSampleSadFour16x8_mmi, WELS_CPU_MMI, 16, 8)
+GENERATE_SadFour_UT (WelsSampleSadFour16x16_mmi, WELS_CPU_MMI, 16, 16)
 #endif