ref: 9e2abda78f0fc0e6a4c2a7a3e2e3067404acdade
parent: 07cb68d0a6e0a4dd71787021e57bc59fd3fcc543
author: gxw <[email protected]>
date: Fri Aug 10 07:22:31 EDT 2018
Add optimization files in codec/common/mips: 1. Add copy_mb_mmi.c, expand_picture_mmi.c, satd_sad_mmi.c and intra_pred_com_mmi.c in codec/common/mips. 2. Modify the format of codec/common/inc/asmdefs_mmi.h. Change-Id: I065cdb7574067abfbd8701fe57d2a4fef043d398
--- a/codec/common/inc/asmdefs_mmi.h
+++ b/codec/common/inc/asmdefs_mmi.h
@@ -35,303 +35,306 @@
#define CACHE_LINE_SIZE 32
-#if defined(__mips64) && defined(__LP64__)
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
# define mips_reg int64_t
+# define PTRSIZE " 8 "
+# define PTRLOG " 3 "
# define PTR_ADDU "daddu "
# define PTR_ADDIU "daddiu "
# define PTR_ADDI "daddi "
# define PTR_SUBU "dsubu "
# define PTR_L "ld "
+# define PTR_S "sd "
# define PTR_SRA "dsra "
# define PTR_SRL "dsrl "
# define PTR_SLL "dsll "
-# define PTR_MTC1 "dmtc1 "
-# define PTR_LI "dli "
#else
# define mips_reg int32_t
+# define PTRSIZE " 4 "
+# define PTRLOG " 2 "
# define PTR_ADDU "addu "
# define PTR_ADDIU "addiu "
# define PTR_ADDI "addi "
# define PTR_SUBU "subu "
# define PTR_L "lw "
+# define PTR_S "sw "
# define PTR_SRA "sra "
# define PTR_SRL "srl "
# define PTR_SLL "sll "
-# define PTR_MTC1 "mtc1 "
-# define PTR_LI "li "
#endif
#define MMI_XSawp_BH(f0, f2, f4, f6, f8, f10) \
- "mov.d "#f8", "#f2" \n\t" \
- "punpckhbh "#f2", "#f0", "#f4" \n\t" \
- "punpcklbh "#f0", "#f0", "#f4" \n\t" \
- "punpckhbh "#f10", "#f8", "#f6" \n\t" \
- "punpcklbh "#f8", "#f8", "#f6" \n\t"
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhbh "#f2", "#f0", "#f4" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "punpckhbh "#f10", "#f8", "#f6" \n\t" \
+ "punpcklbh "#f8", "#f8", "#f6" \n\t"
#define MMI_XSawp_HW(f0, f2, f4, f6, f8, f10) \
- "mov.d "#f8", "#f2" \n\t" \
- "punpckhhw "#f2", "#f0", "#f4" \n\t" \
- "punpcklhw "#f0", "#f0", "#f4" \n\t" \
- "punpckhhw "#f10", "#f8", "#f6" \n\t" \
- "punpcklhw "#f8", "#f8", "#f6" \n\t"
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f4" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f4" \n\t" \
+ "punpckhhw "#f10", "#f8", "#f6" \n\t" \
+ "punpcklhw "#f8", "#f8", "#f6" \n\t"
#define MMI_XSawp_WD(f0, f2, f4, f6, f8, f10) \
- "mov.d "#f8", "#f2" \n\t" \
- "punpckhwd "#f2", "#f0", "#f4" \n\t" \
- "punpcklwd "#f0", "#f0", "#f4" \n\t" \
- "punpckhwd "#f10", "#f8", "#f6" \n\t" \
- "punpcklwd "#f8", "#f8", "#f6" \n\t"
+ "mov.d "#f8", "#f2" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f4" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f4" \n\t" \
+ "punpckhwd "#f10", "#f8", "#f6" \n\t" \
+ "punpcklwd "#f8", "#f8", "#f6" \n\t"
#define MMI_XSawp_DQ(f0, f2, f4, f6, f8, f10) \
- "mov.d "#f8", "#f2" \n\t" \
- "mov.d "#f2", "#f4" \n\t" \
- "mov.d "#f10", "#f6" \n\t"
+ "mov.d "#f8", "#f2" \n\t" \
+ "mov.d "#f2", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t"
#define WELS_AbsH(f0, f2, f4, f6, f8, f10) \
- "xor "#f8", "#f8", "#f8" \n\t" \
- "psubh "#f10", "#f8", "#f6" \n\t" \
- "psubh "#f8", "#f8", "#f4" \n\t" \
- "pmaxsh "#f0", "#f4", "#f8" \n\t" \
- "pmaxsh "#f2", "#f6", "#f10" \n\t"
+ "xor "#f8", "#f8", "#f8" \n\t" \
+ "psubh "#f10", "#f8", "#f6" \n\t" \
+ "psubh "#f8", "#f8", "#f4" \n\t" \
+ "pmaxsh "#f0", "#f4", "#f8" \n\t" \
+ "pmaxsh "#f2", "#f6", "#f10" \n\t"
#define MMI_SumSub(f0, f2, f4, f6, f8, f10) \
- "mov.d "#f8", "#f4" \n\t" \
- "mov.d "#f10", "#f6" \n\t" \
- "paddh "#f4", "#f4", "#f0" \n\t" \
- "paddh "#f6", "#f6", "#f2" \n\t" \
- "psubh "#f0", "#f0", "#f8" \n\t" \
- "psubh "#f2", "#f2", "#f10" \n\t"
+ "mov.d "#f8", "#f4" \n\t" \
+ "mov.d "#f10", "#f6" \n\t" \
+ "paddh "#f4", "#f4", "#f0" \n\t" \
+ "paddh "#f6", "#f6", "#f2" \n\t" \
+ "psubh "#f0", "#f0", "#f8" \n\t" \
+ "psubh "#f2", "#f2", "#f10" \n\t"
#define MMI_LoadDiff8P(f0, f2, f4, f6, f8, r0, r1) \
- "gsldlc1 "#f0", 0x7("#r0") \n\t" \
- "gsldlc1 "#f4", 0x7("#r1") \n\t" \
- "gsldrc1 "#f0", 0x0("#r0") \n\t" \
- "gsldrc1 "#f4", 0x0("#r1") \n\t" \
- "punpckhbh "#f2", "#f0", "#f8" \n\t" \
- "punpcklbh "#f0", "#f0", "#f8" \n\t" \
- "punpckhbh "#f6", "#f4", "#f8" \n\t" \
- "punpcklbh "#f4", "#f4", "#f8" \n\t" \
- "psubh "#f0", "#f0", "#f4" \n\t" \
- "psubh "#f2", "#f2", "#f6" \n\t"
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
#define MMI_TransTwo4x4H(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
- MMI_XSawp_HW(f0, f2, f4, f6, f16, f18) \
- MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
- MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
- MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
- MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6) \
- MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
+ MMI_XSawp_HW(f0, f2, f4, f6, f16, f18) \
+ MMI_XSawp_HW(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_WD(f0, f2, f8, f10, f12, f14) \
+ MMI_XSawp_WD(f16, f18, f4, f6, f8, f10) \
+ MMI_XSawp_DQ(f0, f2, f16, f18, f4, f6) \
+ MMI_XSawp_DQ(f12, f14, f8, f10, f16, f18)
#define MMI_TransTwo8x8B(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26, f28, f30, r0, r1) \
- "dmfc1 "#r0", "#f28" \n\t" \
- "dmfc1 "#r1", "#f30" \n\t" \
- MMI_XSawp_BH(f0, f2, f4, f6, f28, f30) \
- MMI_XSawp_BH(f8, f10, f12, f14, f4, f6) \
- MMI_XSawp_BH(f16, f18, f20, f22, f12, f14) \
- "dmtc1 "#r0", "#f20" \n\t" \
- "dmtc1 "#r1", "#f22" \n\t" \
- "dmfc1 "#r0", "#f12" \n\t" \
- "dmfc1 "#r1", "#f14" \n\t" \
- MMI_XSawp_BH(f24, f26, f20, f22, f12, f14) \
- MMI_XSawp_HW(f0, f2, f8, f10, f20, f22) \
- MMI_XSawp_HW(f28, f30, f4, f6, f8, f10) \
- MMI_XSawp_HW(f16, f18, f24, f26, f4, f6) \
- "dmtc1 "#r0", "#f24" \n\t" \
- "dmtc1 "#r1", "#f26" \n\t" \
- "dmfc1 "#r0", "#f8" \n\t" \
- "dmfc1 "#r1", "#f10" \n\t" \
- MMI_XSawp_HW(f24, f26, f12, f14, f8, f10) \
- MMI_XSawp_WD(f0, f2, f16, f18, f12, f14) \
- MMI_XSawp_WD(f20, f22, f4, f6, f16, f18) \
- MMI_XSawp_WD(f28, f30, f24, f26, f4, f6) \
- "dmtc1 "#r0", "#f24" \n\t" \
- "dmtc1 "#r1", "#f26" \n\t" \
- "dmfc1 "#r0", "#f16" \n\t" \
- "dmfc1 "#r1", "#f18" \n\t" \
- MMI_XSawp_WD(f24, f26, f8, f10, f16, f18) \
- MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10) \
- MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30) \
- MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6) \
- "dmtc1 "#r0", "#f24" \n\t" \
- "dmtc1 "#r1", "#f26" \n\t" \
- "dmfc1 "#r0", "#f0" \n\t" \
- "dmfc1 "#r1", "#f2" \n\t" \
- MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2) \
- "dmtc1 "#r0", "#f16" \n\t" \
- "dmtc1 "#r1", "#f18" \n\t"
+ "dmfc1 "#r0", "#f28" \n\t" \
+ "dmfc1 "#r1", "#f30" \n\t" \
+ MMI_XSawp_BH(f0, f2, f4, f6, f28, f30) \
+ MMI_XSawp_BH(f8, f10, f12, f14, f4, f6) \
+ MMI_XSawp_BH(f16, f18, f20, f22, f12, f14) \
+ "dmtc1 "#r0", "#f20" \n\t" \
+ "dmtc1 "#r1", "#f22" \n\t" \
+ "dmfc1 "#r0", "#f12" \n\t" \
+ "dmfc1 "#r1", "#f14" \n\t" \
+ MMI_XSawp_BH(f24, f26, f20, f22, f12, f14) \
+ MMI_XSawp_HW(f0, f2, f8, f10, f20, f22) \
+ MMI_XSawp_HW(f28, f30, f4, f6, f8, f10) \
+ MMI_XSawp_HW(f16, f18, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f8" \n\t" \
+ "dmfc1 "#r1", "#f10" \n\t" \
+ MMI_XSawp_HW(f24, f26, f12, f14, f8, f10) \
+ MMI_XSawp_WD(f0, f2, f16, f18, f12, f14) \
+ MMI_XSawp_WD(f20, f22, f4, f6, f16, f18) \
+ MMI_XSawp_WD(f28, f30, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f16" \n\t" \
+ "dmfc1 "#r1", "#f18" \n\t" \
+ MMI_XSawp_WD(f24, f26, f8, f10, f16, f18) \
+ MMI_XSawp_DQ(f0, f2, f28, f30, f8, f10) \
+ MMI_XSawp_DQ(f12, f14, f4, f6, f28, f30) \
+ MMI_XSawp_DQ(f20, f22, f24, f26, f4, f6) \
+ "dmtc1 "#r0", "#f24" \n\t" \
+ "dmtc1 "#r1", "#f26" \n\t" \
+ "dmfc1 "#r0", "#f0" \n\t" \
+ "dmfc1 "#r1", "#f2" \n\t" \
+ MMI_XSawp_DQ(f24, f26, f16, f18, f0, f2) \
+ "dmtc1 "#r0", "#f16" \n\t" \
+ "dmtc1 "#r1", "#f18" \n\t"
#define MMI_XSwap_HW_SINGLE(f0, f2, f4) \
- "mov.d "#f4", "#f0" \n\t" \
- "punpckhhw "#f4", "#f4", "#f2" \n\t" \
- "punpcklhw "#f0", "#f0", "#f2" \n\t"
+ "mov.d "#f4", "#f0" \n\t" \
+ "punpckhhw "#f4", "#f4", "#f2" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f2" \n\t"
#define MMI_XSwap_WD_SINGLE(f0, f2, f4) \
- "mov.d "#f4", "#f0" \n\t" \
- "punpckhwd "#f4", "#f4", "#f2" \n\t" \
- "punpcklwd "#f0", "#f0", "#f2" \n\t"
+ "mov.d "#f4", "#f0" \n\t" \
+ "punpckhwd "#f4", "#f4", "#f2" \n\t" \
+ "punpcklwd "#f0", "#f0", "#f2" \n\t"
#define MMI_Trans4x4H_SINGLE(f0, f2, f4, f6, f8) \
- MMI_XSwap_HW_SINGLE(f0, f2, f8) \
- MMI_XSwap_HW_SINGLE(f4, f6, f2) \
- MMI_XSwap_WD_SINGLE(f0, f4, f6) \
- MMI_XSwap_WD_SINGLE(f8, f2, f4)
+ MMI_XSwap_HW_SINGLE(f0, f2, f8) \
+ MMI_XSwap_HW_SINGLE(f4, f6, f2) \
+ MMI_XSwap_WD_SINGLE(f0, f4, f6) \
+ MMI_XSwap_WD_SINGLE(f8, f2, f4)
#define MMI_SumSub_SINGLE(f0, f2, f4) \
- "mov.d "#f4", "#f2" \n\t" \
- "psubh "#f2", "#f2", "#f0" \n\t" \
- "paddh "#f0", "#f0", "#f4" \n\t"
+ "mov.d "#f4", "#f2" \n\t" \
+ "psubh "#f2", "#f2", "#f0" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t"
#define MMI_SumSubMul2_SINGLE(f0, f2, f4, f6) \
- "mov.d "#f4", "#f0" \n\t" \
- "psllh "#f0", "#f0", "#f6" \n\t" \
- "paddh "#f0", "#f0", "#f2" \n\t" \
- "psllh "#f2", "#f2", "#f6" \n\t" \
- "psubh "#f4", "#f4", "#f2" \n\t"
+ "mov.d "#f4", "#f0" \n\t" \
+ "psllh "#f0", "#f0", "#f6" \n\t" \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "psllh "#f2", "#f2", "#f6" \n\t" \
+ "psubh "#f4", "#f4", "#f2" \n\t"
//f4 should be 0x0
#define MMI_Copy8Times(f0, f2, f4, r0) \
- "dmtc1 "#r0", "#f0" \n\t" \
- "pshufh "#f0", "#f0", "#f4" \n\t" \
- "mov.d "#f2", "#f0" \n\t"
+ "dmtc1 "#r0", "#f0" \n\t" \
+ "pshufh "#f0", "#f0", "#f4" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
//f4 should be 0x0
#define MMI_Copy16Times(f0, f2, f4, r0) \
- "dmtc1 "#r0", "#f0" \n\t" \
- "punpcklbh "#f0", "#f0", "#f0" \n\t" \
- "pshufh "#f0", "#f0", "#f4" \n\t" \
- "mov.d "#f2", "#f0" \n\t"
+ "dmtc1 "#r0", "#f0" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f0" \n\t" \
+ "pshufh "#f0", "#f0", "#f4" \n\t" \
+ "mov.d "#f2", "#f0" \n\t"
#define MMI_SumSubDiv2_SINGLE(f0, f2, f4, f6) \
- "psrah "#f4", "#f2", "#f6" \n\t" \
- "paddh "#f4", "#f4", "#f0" \n\t" \
- "psrah "#f0", "#f0", "#f6" \n\t" \
- "psubh "#f0", "#f0", "#f2" \n\t"
+ "psrah "#f4", "#f2", "#f6" \n\t" \
+ "paddh "#f4", "#f4", "#f0" \n\t" \
+ "psrah "#f0", "#f0", "#f6" \n\t" \
+ "psubh "#f0", "#f0", "#f2" \n\t"
#define MMI_IDCT_SINGLE(f0, f2, f4, f6, f8, f10, f12) \
- MMI_SumSub_SINGLE(f6, f8, f10) \
- MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12) \
- MMI_SumSub_SINGLE(f0, f6, f10) \
- MMI_SumSub_SINGLE(f4, f8, f10)
+ MMI_SumSub_SINGLE(f6, f8, f10) \
+ MMI_SumSubDiv2_SINGLE(f4, f2, f0, f12) \
+ MMI_SumSub_SINGLE(f0, f6, f10) \
+ MMI_SumSub_SINGLE(f4, f8, f10)
#define MMI_StoreDiff4P_SINGLE(f0, f2, f4, f6, r0, r1, f8) \
- "gsldlc1 "#f2", 0x7("#r1") \n\t" \
- "gsldrc1 "#f2", 0x0("#r1") \n\t" \
- "punpcklbh "#f2", "#f2", "#f6" \n\t" \
- "paddh "#f0", "#f0", "#f4" \n\t" \
- "psrah "#f0", "#f0", "#f8" \n\t" \
- "paddsh "#f0", "#f0", "#f2" \n\t" \
- "packushb "#f0", "#f0", "#f2" \n\t" \
- "gsswlc1 "#f0", 0x3("#r0") \n\t" \
- "gsswrc1 "#f0", 0x0("#r0") \n\t"
+ "gsldlc1 "#f2", 0x7("#r1") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r1") \n\t" \
+ "punpcklbh "#f2", "#f2", "#f6" \n\t" \
+ "paddh "#f0", "#f0", "#f4" \n\t" \
+ "psrah "#f0", "#f0", "#f8" \n\t" \
+ "paddsh "#f0", "#f0", "#f2" \n\t" \
+ "packushb "#f0", "#f0", "#f2" \n\t" \
+ "gsswlc1 "#f0", 0x3("#r0") \n\t" \
+ "gsswrc1 "#f0", 0x0("#r0") \n\t"
#define SUMH_HORIZON(f0, f2, f4, f6, f8) \
- "paddh "#f0", "#f0", "#f2" \n\t" \
- "punpckhhw "#f2", "#f0", "#f8" \n\t" \
- "punpcklhw "#f0", "#f0", "#f8" \n\t" \
- "paddw "#f0", "#f0", "#f2" \n\t" \
- "punpckhwd "#f2", "#f0", "#f0" \n\t" \
- "paddw "#f0", "#f0", "#f2" \n\t"
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t" \
+ "punpckhwd "#f2", "#f0", "#f0" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t"
#define LOAD_COLUMN(f0, f2, f4, f6, f8, f10, f12, f14, r0, r1, r2) \
- "daddu "#r2", "#r0", "#r1" \n\t" \
- "gsldlc1 "#f0", 0x7("#r0") \n\t" \
- "gsldlc1 "#f4", 0x7("#r2") \n\t" \
- "gsldrc1 "#f0", 0x0("#r0") \n\t" \
- "gsldrc1 "#f4", 0x0("#r2") \n\t" \
- "punpcklbh "#f0", "#f0", "#f4" \n\t" \
- "daddu "#r0", "#r2", "#r1" \n\t" \
- "daddu "#r2", "#r0", "#r1" \n\t" \
- "gsldlc1 "#f8", 0x7("#r0") \n\t" \
- "gsldlc1 "#f4", 0x7("#r2") \n\t" \
- "gsldrc1 "#f8", 0x0("#r0") \n\t" \
- "gsldrc1 "#f4", 0x0("#r2") \n\t" \
- "punpcklbh "#f8", "#f8", "#f4" \n\t" \
- "punpckhhw "#f2", "#f0", "#f8" \n\t" \
- "punpcklhw "#f0", "#f0", "#f8" \n\t" \
- "daddu "#r0", "#r2", "#r1" \n\t" \
- "daddu "#r2", "#r0", "#r1" \n\t" \
- "gsldlc1 "#f12", 0x7("#r0") \n\t" \
- "gsldlc1 "#f4", 0x7("#r2") \n\t" \
- "gsldrc1 "#f12", 0x0("#r0") \n\t" \
- "gsldrc1 "#f4", 0x0("#r2") \n\t" \
- "punpcklbh "#f12", "#f12", "#f4" \n\t" \
- "daddu "#r0", "#r2", "#r1" \n\t" \
- "daddu "#r2", "#r0", "#r1" \n\t" \
- "gsldlc1 "#f8", 0x7("#r0") \n\t" \
- "gsldlc1 "#f4", 0x7("#r2") \n\t" \
- "gsldrc1 "#f8", 0x0("#r0") \n\t" \
- "gsldrc1 "#f4", 0x0("#r2") \n\t" \
- "punpcklbh "#f8", "#f8", "#f4" \n\t" \
- "punpckhhw "#f14", "#f12", "#f8" \n\t" \
- "punpcklhw "#f12", "#f12", "#f8" \n\t" \
- "daddu "#r0", "#r2", "#r1" \n\t" \
- "punpcklwd "#f0", "#f2", "#f14" \n\t" \
- "punpckhwd "#f2", "#f2", "#f14" \n\t"
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f8", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f8", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f8", "#f8", "#f4" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f12", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f12", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f12", "#f12", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f8", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f8", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r2") \n\t" \
+ "punpcklbh "#f8", "#f8", "#f4" \n\t" \
+ "punpckhhw "#f14", "#f12", "#f8" \n\t" \
+ "punpcklhw "#f12", "#f12", "#f8" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "punpcklwd "#f0", "#f2", "#f14" \n\t" \
+ "punpckhwd "#f2", "#f2", "#f14" \n\t"
#define LOAD_COLUMN_C(f0, f2, f4, f6, r0, r1, r2) \
- "daddu "#r2", "#r0", "#r1" \n\t" \
- "gsldlc1 "#f0", 0x7("#r0") \n\t" \
- "gsldlc1 "#f2", 0x7("#r2") \n\t" \
- "gsldrc1 "#f0", 0x0("#r0") \n\t" \
- "gsldrc1 "#f2", 0x0("#r2") \n\t" \
- "punpcklbh "#f0", "#f0", "#f2" \n\t" \
- "daddu "#r0", "#r2", "#r1" \n\t" \
- "daddu "#r2", "#r0", "#r1" \n\t" \
- "gsldlc1 "#f4", 0x7("#r0") \n\t" \
- "gsldlc1 "#f2", 0x7("#r2") \n\t" \
- "gsldrc1 "#f4", 0x0("#r0") \n\t" \
- "gsldrc1 "#f2", 0x0("#r2") \n\t" \
- "punpcklbh "#f4", "#f4", "#f2" \n\t" \
- "punpckhhw "#f0", "#f0", "#f4" \n\t" \
- "daddu "#r0", "#r2", "#r1" \n\t"
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r2") \n\t" \
+ "punpcklbh "#f0", "#f0", "#f2" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t" \
+ "daddu "#r2", "#r0", "#r1" \n\t" \
+ "gsldlc1 "#f4", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f2", 0x7("#r2") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f2", 0x0("#r2") \n\t" \
+ "punpcklbh "#f4", "#f4", "#f2" \n\t" \
+ "punpckhhw "#f0", "#f0", "#f4" \n\t" \
+ "daddu "#r0", "#r2", "#r1" \n\t"
+
/**
* backup register
*/
#define BACKUP_REG \
- double __back_temp[8]; \
- if (_MIPS_SIM == _ABI64) \
- __asm__ volatile ( \
- "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
- "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
- "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
- "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
- : \
- : [temp]"r"(__back_temp) \
- : "memory" \
- ); \
- else \
- __asm__ volatile ( \
- "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
- "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
- "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
- : \
- : [temp]"r"(__back_temp) \
- : "memory" \
- );
+ /* Save area for the FP registers spilled below (up to 4 x 16 bytes). */ \
+ /* gssqc1/gslqc1 move 16 bytes per instruction, so the buffer is forced */ \
+ /* to 16-byte alignment; a plain double[] only guarantees 8. */ \
+ double __attribute__((aligned(16))) __back_temp[8]; \
+ if (_MIPS_SIM == _ABI64) \
+ __asm__ volatile ( \
+ "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
+ "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
+ "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
+ "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ ); \
+ else \
+ __asm__ volatile ( \
+ "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
+ "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
+ "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ );
/**
* recover register
*/
#define RECOVER_REG \
- if (_MIPS_SIM == _ABI64) \
- __asm__ volatile ( \
- "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
- "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
- "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
- "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
- : \
- : [temp]"r"(__back_temp) \
- : "memory" \
- ); \
- else \
- __asm__ volatile ( \
- "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
- "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
- "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
- : \
- : [temp]"r"(__back_temp) \
- : "memory" \
- );
+ if (_MIPS_SIM == _ABI64) \
+ __asm__ volatile ( \
+ "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
+ "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
+ "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
+ "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ ); \
+ else \
+ __asm__ volatile ( \
+ "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
+ "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
+ "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
+ : \
+ : [temp]"r"(__back_temp) \
+ : "memory" \
+ );
# define OK 1
# define NOTOK 0
--- a/codec/common/inc/copy_mb.h
+++ b/codec/common/inc/copy_mb.h
@@ -75,6 +75,13 @@
void WelsCopy8x16_AArch64_neon (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
#endif
+#if defined (HAVE_MMI)
+void WelsCopy8x8_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy8x16_mmi (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
+void WelsCopy16x8NotAligned_mmi (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS);
+void WelsCopy16x16_mmi (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS);
+void WelsCopy16x16NotAligned_mmi (uint8_t* Dst, int32_t iStrideD, uint8_t* Src, int32_t iStrideS);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/common/inc/expand_pic.h
+++ b/codec/common/inc/expand_pic.h
@@ -73,6 +73,15 @@
const int32_t kiPicH);
#endif
+#if defined(HAVE_MMI)
+void ExpandPictureLuma_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+ const int32_t kiPicH);
+void ExpandPictureChromaAlign_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+ const int32_t kiPicH);
+void ExpandPictureChromaUnalign_mmi (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
+ const int32_t kiPicH);
+#endif//HAVE_MMI
+
typedef void (*PExpandPictureFunc) (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
typedef struct TagExpandPicFunc {
--- a/codec/common/inc/intra_pred_common.h
+++ b/codec/common/inc/intra_pred_common.h
@@ -67,6 +67,11 @@
void WelsI16x16LumaPredV_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsI16x16LumaPredH_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
#endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI)
+void WelsI16x16LumaPredV_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/common/inc/sad_common.h
+++ b/codec/common/inc/sad_common.h
@@ -104,6 +104,19 @@
void WelsSampleSadFour8x8_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
void WelsSampleSadFour4x4_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
#endif
+
+#if defined (HAVE_MMI)
+int32_t WelsSampleSad4x4_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSad8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+
+void WelsSampleSadFour16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+void WelsSampleSadFour8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- /dev/null
+++ b/codec/common/mips/copy_mb_mmi.c
@@ -1,0 +1,477 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file copy_mb_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 20/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+/**
+ * Copy an 8x8 block of bytes from pSrc to pDst.
+ *
+ * All eight source rows are first loaded into $f0..$f14 (one 8-byte row per
+ * register) and only afterwards written to the destination. Every row is
+ * moved with a left/right instruction pair (gsldlc1 + gsldrc1 for loads,
+ * gssdlc1 + gssdrc1 for stores) at offsets 0x7 and 0x0 -- presumably the
+ * Loongson unaligned 64-bit access idiom, so neither pointer should need
+ * 8-byte alignment (confirm against the Loongson manual).
+ *
+ * pDst     address of the first destination row
+ * iStrideD byte distance between consecutive destination rows
+ * pSrc     address of the first source row
+ * iStrideS byte distance between consecutive source rows
+ *
+ * $8 is a scratch register holding the address of the second row of each
+ * row pair; pSrc and pDst are advanced in place, hence the "+&r" operands.
+ */
+void WelsCopy8x8_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
+ int32_t iStrideS ) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Load phase: rows 0..7 of the source, two rows per step. */
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0x7($8) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f14, 0x7($8) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0($8) \n\t"
+
+ /* Store phase: write the eight buffered rows, two rows per step. */
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f14, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f14, 0x0($8) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
+/**
+ * Copy a block 8 bytes wide and 16 rows high from pSrc to pDst.
+ *
+ * The work is done as two identical passes of eight rows each: load eight
+ * rows into $f0..$f14, store them, then repeat for the next eight rows.
+ * Rows are moved with gsldlc1/gsldrc1 and gssdlc1/gssdrc1 pairs at offsets
+ * 0x7/0x0 -- presumably the Loongson unaligned 64-bit access idiom (confirm
+ * against the Loongson manual).
+ *
+ * pDst     address of the first destination row
+ * iStrideD byte distance between consecutive destination rows
+ * pSrc     address of the first source row
+ * iStrideS byte distance between consecutive source rows
+ *
+ * $8 is a scratch register holding the address of the second row of each
+ * row pair; pSrc and pDst are advanced in place, hence the "+&r" operands.
+ */
+void WelsCopy8x16_mmi(uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc,
+ int32_t iStrideS) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Pass 1, load: source rows 0..7. */
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0x7($8) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f14, 0x7($8) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0($8) \n\t"
+ /* Step pSrc to row 8 so the second pass can reuse the same sequence. */
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+
+ /* Pass 1, store: destination rows 0..7. */
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f14, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f14, 0x0($8) \n\t"
+ /* Step pDst to row 8 for the second pass. */
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+
+ /* Pass 2, load: source rows 8..15. */
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f4, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f6, 0x7($8) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f10, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pSrc], $8, %[iStrideS] \n\t"
+ PTR_ADDU "$8, %[pSrc], %[iStrideS] \n\t"
+ "gsldlc1 $f12, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f14, 0x7($8) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0($8) \n\t"
+
+ /* Pass 2, store: destination rows 8..15. */
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f0, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f2, 0x7($8) \n\t"
+ "gssdrc1 $f0, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f2, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f4, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f6, 0x7($8) \n\t"
+ "gssdrc1 $f4, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f8, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f10, 0x7($8) \n\t"
+ "gssdrc1 $f8, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f10, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iStrideD] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iStrideD] \n\t"
+ "gssdlc1 $f12, 0x7(%[pDst]) \n\t"
+ "gssdlc1 $f14, 0x7($8) \n\t"
+ "gssdrc1 $f12, 0x0(%[pDst]) \n\t"
+ "gssdrc1 $f14, 0x0($8) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iStrideD]"r"(iStrideD), [iStrideS]"r"(iStrideS)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
+ );
+}
+
+/**
+ * Copy a 16x16 block of bytes from pSrc to pDst.
+ *
+ * Each 16-byte row is moved with a single quad-word access (gslqc1 to load
+ * a register pair, gssqc1 to store it). The block is handled as two passes
+ * of eight rows: load eight rows into $f0..$f30, store them, repeat.
+ *
+ * NOTE(review): the quad-word accesses presumably require every source and
+ * destination row to be 16-byte aligned; WelsCopy16x16NotAligned_mmi loads
+ * with gsldlc1/gsldrc1 instead -- confirm against callers.
+ *
+ * pDst       address of the first destination row
+ * iDstStride byte distance between consecutive destination rows
+ * pSrc       address of the first source row
+ * iSrcStride byte distance between consecutive source rows
+ *
+ * BACKUP_REG / RECOVER_REG save and restore a set of FP registers around
+ * the asm block, which uses every even FP register from $f0 to $f30.
+ */
+void WelsCopy16x16_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc,
+ int32_t iSrcStride) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Pass 1, load: source rows 0..7; pSrc ends up pointing at row 8. */
+ "gslqc1 $f0, $f2, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f4, $f6, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f8, $f10, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f12, $f14, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f16, $f18, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f20, $f22, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f24, $f26, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f28, $f30, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+
+ /* Pass 1, store: destination rows 0..7; pDst ends up at row 8. */
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+
+ /* Pass 2, load: source rows 8..15. */
+ "gslqc1 $f0, $f2, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f4, $f6, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f8, $f10, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f12, $f14, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f16, $f18, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f20, $f22, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f24, $f26, 0x0(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gslqc1 $f28, $f30, 0x0(%[pSrc]) \n\t"
+
+ /* Pass 2, store: destination rows 8..15. */
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc)
+ : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void WelsCopy16x16NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc, /* Copies 16 rows of 16 bytes from pSrc to pDst; source rows are read with left/right (unaligned) load pairs. */
+ int32_t iSrcStride) { /* iDstStride / iSrcStride are the byte offsets added to pDst / pSrc to step to the next row. */
+ BACKUP_REG; /* defined outside this chunk; NOTE(review): presumably saves the FP registers clobbered below - confirm in asmdefs_mmi.h */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f2, 0x7(%[pSrc]) \n\t" /* gsldlc1 + gsldrc1 on the same register form one unaligned 64-bit load: $f2 <- row bytes 0..7 */
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t" /* $f0 <- row bytes 8..15 */
+ "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" /* step to the next source row */
+ "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f28, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" /* pSrc now points at row 8; rows 0..7 are held in $f0..$f30 */
+ /* store rows 0..7, two rows per destination pointer update */
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t" /* $8 = address of the row after pDst */
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t" /* 16-byte store of one row; NOTE(review): gssqc1 presumably needs a 16-byte-aligned pDst - confirm with callers */
+ "gssqc1 $f4, $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t" /* pDst advances by two rows in total */
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f12, $f14, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f20, $f22, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f28, $f30, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t" /* pDst now points at destination row 8 */
+ /* second half: load rows 8..15 with the same pattern */
+ "gsldlc1 $f2, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f28, 0x8(%[pSrc]) \n\t" /* last row: pSrc is not advanced again */
+ /* store rows 8..15 */
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f4, $f6, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f12, $f14, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f20, $f22, 0x0($8) \n\t"
+ PTR_ADDU "%[pDst], $8, %[iDstStride] \n\t"
+ PTR_ADDU "$8, %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ "gssqc1 $f28, $f30, 0x0($8) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are modified by the asm, hence read-write operands */
+ : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride) /* strides are read-only inputs */
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", /* $8 is the scratch address register used for every second store */
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG; defined outside this chunk */
+}
+
+void WelsCopy16x8NotAligned_mmi(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrc, /* Copies 8 rows of 16 bytes from pSrc to pDst; source rows are read with left/right (unaligned) load pairs. */
+ int32_t iSrcStride) { /* iDstStride / iSrcStride are the byte offsets added to pDst / pSrc to step to the next row. */
+ BACKUP_REG; /* defined outside this chunk; NOTE(review): presumably saves the FP registers clobbered below - confirm in asmdefs_mmi.h */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "gsldlc1 $f2, 0x7(%[pSrc]) \n\t" /* gsldlc1 + gsldrc1 on the same register form one unaligned 64-bit load: $f2 <- row bytes 0..7 */
+ "gsldlc1 $f0, 0xF(%[pSrc]) \n\t" /* $f0 <- row bytes 8..15 */
+ "gsldrc1 $f2, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f0, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" /* step to the next source row */
+ "gsldlc1 $f6, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f4, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f4, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f10, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f8, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f10, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f8, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f14, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f12, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f14, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f12, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f18, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f16, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f18, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f16, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f22, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f20, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f22, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f20, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f26, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f24, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f26, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f24, 0x8(%[pSrc]) \n\t"
+ PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t"
+ "gsldlc1 $f30, 0x7(%[pSrc]) \n\t"
+ "gsldlc1 $f28, 0xF(%[pSrc]) \n\t"
+ "gsldrc1 $f30, 0x0(%[pSrc]) \n\t"
+ "gsldrc1 $f28, 0x8(%[pSrc]) \n\t" /* last row: pSrc is not advanced again */
+ /* all 8 rows are now in $f0..$f30; write them out one row per pDst step */
+ "gssqc1 $f0, $f2, 0x0(%[pDst]) \n\t" /* 16-byte store of one row; NOTE(review): gssqc1 presumably needs a 16-byte-aligned pDst - confirm with callers */
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f4, $f6, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f8, $f10, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f12, $f14, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f16, $f18, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f20, $f22, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f24, $f26, 0x0(%[pDst]) \n\t"
+ PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t"
+ "gssqc1 $f28, $f30, 0x0(%[pDst]) \n\t"
+ : [pDst]"+&r"((unsigned char *)pDst), [pSrc]"+&r"((unsigned char *)pSrc) /* both pointers are modified by the asm, hence read-write operands */
+ : [iDstStride]"r"((int)iDstStride), [iSrcStride]"r"((int)iSrcStride) /* strides are read-only inputs */
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG; defined outside this chunk */
+}
--- /dev/null
+++ b/codec/common/mips/expand_picture_mmi.c
@@ -1,0 +1,673 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file expand_picture_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 24/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+#define mov_line_8x4_mmi_aligned(r0, r1, f0) /* Asm fragment: store the 8 bytes of f0 to four consecutive rows at r0, adding r1 to r0 after every store (including the last). */ \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" /* indexed 64-bit store with index register $0, i.e. to address r0 */ \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_8x4_mmi_unaligned(r0, r1, f0) /* Asm fragment: store the 8 bytes of f0 to four consecutive rows at r0 using left/right (unaligned) store pairs, adding r1 to r0 after every row (including the last). */ \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" /* gssdlc1 + gssdrc1 together write one 64-bit value to a possibly unaligned address */ \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end8x4_mmi_aligned(r0, r1, f0) /* Asm fragment: store the 8 bytes of f0 to four consecutive rows at r0, adding r1 to r0 after each store. */ \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdxc1 "#f0", 0x0("#r0", $0) \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" /* NOTE(review): the other mov_line_end* macros omit this final advance; here the body is identical to mov_line_8x4_mmi_aligned - confirm whether that is intended */
+
+#define mov_line_end8x4_mmi_unaligned(r0, r1, f0) /* Asm fragment: store the 8 bytes of f0 to four consecutive rows at r0 using left/right (unaligned) store pairs; r0 advances by r1 between rows but NOT after the last one. */ \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" /* gssdlc1 + gssdrc1 together write one 64-bit value to a possibly unaligned address */ \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" /* final line carries no continuation backslash, so the macro ends here regardless of what follows */
+
+#define mov_line_16x4_mmi_aligned(r0, r1, f0, f2) /* Asm fragment: store the 16 bytes held in the f2/f0 register pair to four consecutive rows at r0, adding r1 to r0 after every store (including the last). */ \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" /* one 128-bit store; NOTE(review): presumably requires r0 to be 16-byte aligned - confirm */ \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_16x4_mmi_unaligned(r0, r1, f0, f2) /* Asm fragment: store f0 (bytes 0..7) and f2 (bytes 8..15) to four consecutive rows at r0 using left/right (unaligned) store pairs, adding r1 to r0 after every row (including the last). */ \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" /* f0 goes to row bytes 0..7 */ \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" /* f2 goes to row bytes 8..15 */ \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end16x4_mmi_aligned(r0, r1, f0, f2) /* Asm fragment: like mov_line_16x4_mmi_aligned, but r0 is NOT advanced after the fourth store, so it is left pointing at the last row written. */ \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t"
+
+#define mov_line_end16x4_mmi_unaligned(r0, r1, f0, f2) /* Asm fragment: store f0 (bytes 0..7) and f2 (bytes 8..15) to four consecutive rows at r0 using left/right (unaligned) store pairs; r0 advances by r1 between rows but NOT after the last one. */ \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" /* f0 goes to row bytes 0..7 */ \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" /* f2 goes to row bytes 8..15 */ \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssdlc1 "#f0", 0x7("#r0") \n\t" \
+ "gssdlc1 "#f2", 0xF("#r0") \n\t" \
+ "gssdrc1 "#f0", 0x0("#r0") \n\t" \
+ "gssdrc1 "#f2", 0x8("#r0") \n\t" /* final line carries no continuation backslash, so the macro ends here regardless of what follows */
+
+#define exp_top_bottom_mmi_32 /* Asm fragment (loop label 1): for each 16-byte column, replicate the 16 bytes at %[pDst] into 32 rows through $9 and the 16 bytes at %[iHeight] into 32 rows through $11. Expects $9, $11, %[iHeight] (used as an address) and %[iStride] to be set up by the enclosing asm. */ \
+ "dsra %[iWidth], %[iWidth], 0x4 \n\t" /* iWidth becomes the number of 16-byte columns */ \
+ "1: \n\t" \
+ "gslqc1 $f2, $f0, 0x0(%[pDst]) \n\t" /* 16 bytes read from %[pDst] */ \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) /* 7 x 4 rows + 4 rows = 32 rows */ \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_end16x4_mmi_aligned($9, %[iStride], $f0, $f2) /* "end" variant: $9 stays on the last row written */ \
+ "gslqc1 $f6, $f4, 0x0(%[iHeight]) \n\t" /* 16 bytes read from the address held in %[iHeight] */ \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_end16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ PTR_ADDIU "%[pDst], %[pDst], 0x10 \n\t" /* move all four pointers to the next 16-byte column */ \
+ PTR_ADDIU "$9, $9, 0x10 \n\t" \
+ PTR_ADDIU "%[iHeight], %[iHeight], 0x10 \n\t" \
+ PTR_ADDIU "$11, $11, 0x10 \n\t" \
+ "dnegu %[iStride], %[iStride] \n\t" /* flip the stride sign: the next column is written walking back the way this one came */ \
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" \
+ "bnez %[iWidth], 1b \n\t" \
+ "nop \n\t"
+
+#define exp_left_right_mmi_32 /* Asm fragment (loop label 2): for $8 rows, replicate the byte at %[pDst] into 32 bytes at $9 and the byte at %[iHeight] into 32 bytes at $11, then step all four pointers by %[iStride]. %[iWidth] is reused as a scratch register. */ \
+ "2: \n\t" \
+ "lbu %[iWidth], 0x0(%[pDst]) \n\t" /* byte at %[pDst] */ \
+ MMI_Copy16Times($f0, $f2, $f28, %[iWidth]) /* defined outside this chunk; presumably broadcasts the byte into $f0/$f2 using $f28 as a zero operand - confirm */ \
+ "gssqc1 $f2, $f0, 0x0($9) \n\t" \
+ "gssqc1 $f2, $f0, 0x10($9) \n\t" \
+ "lbu %[iWidth], 0x0(%[iHeight]) \n\t" /* byte at the address held in %[iHeight] */ \
+ MMI_Copy16Times($f4, $f6, $f28, %[iWidth]) \
+ "gssqc1 $f6, $f4, 0x0($11) \n\t" \
+ "gssqc1 $f6, $f4, 0x10($11) \n\t" \
+ PTR_ADDU "%[pDst], %[pDst], %[iStride] \n\t" \
+ PTR_ADDU "$9, $9, %[iStride] \n\t" \
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" \
+ PTR_ADDU "$11, $11, %[iStride] \n\t" \
+ PTR_ADDIU "$8, $8, -0x1 \n\t" /* $8 is the remaining-row counter */ \
+ "bnez $8, 2b \n\t" \
+ "nop \n\t"
+
+#define mov_line_32x4_mmi(r0, r1, f0, f2) /* Asm fragment: write the f2/f0 register pair twice per row (offsets 0x0 and 0x10, 32 bytes) to four consecutive rows at r0, adding r1 to r0 after every row (including the last). */ \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t"
+
+#define mov_line_end32x4_mmi(r0, r1, f0, f2) /* Asm fragment: like mov_line_32x4_mmi, but r0 is NOT advanced after the fourth row. */ \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t" \
+ PTR_ADDU ""#r0", "#r0", "#r1" \n\t" \
+ "gssqc1 "#f2", "#f0", 0x0("#r0") \n\t" \
+ "gssqc1 "#f2", "#f0", 0x10("#r0") \n\t"
+
+#define exp_cross_mmi_32 /* Asm fragment: fill four 32-row x 32-byte regions with constant register pairs: %[iHeight] <- $f12/$f14, $11 <- $f16/$f18, $9 <- $f20/$f22, $8 <- $f24/$f26. Each region is 7 x 4 rows plus a final 4-row "end" group. */ \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) /* region addressed through %[iHeight] */ \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_end32x4_mmi(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) /* region addressed through $11 */ \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_end32x4_mmi($11, %[iStride], $f16, $f18) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) /* region addressed through $9 */ \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_end32x4_mmi($9, %[iStride], $f20, $f22) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) /* region addressed through $8 */ \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_32x4_mmi($8, %[iStride], $f24, $f26) \
+ mov_line_end32x4_mmi($8, %[iStride], $f24, $f26)
+
+#define exp_top_bottom_mmi_16_aligned /* Asm fragment (labels 1 and 2): for each 16-byte column, replicate the 16 bytes at %[pDst] into 16 rows through $9 and the 16 bytes at %[iHeight] into 16 rows through $11; if the low 4 bits of the width are non-zero, one extra 8-byte column is handled afterwards. */ \
+ "move $8, %[iWidth] \n\t" /* keep the original width for the remainder test below */ \
+ "dsra %[iWidth], %[iWidth], 0x4 \n\t" /* iWidth becomes the number of 16-byte columns */ \
+ "1: \n\t" \
+ "gslqc1 $f2, $f0, 0x0(%[pDst]) \n\t" \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) /* 3 x 4 rows + 4 rows = 16 rows */ \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ mov_line_end16x4_mmi_aligned($9, %[iStride], $f0, $f2) \
+ "gslqc1 $f6, $f4, 0x0(%[iHeight]) \n\t" /* 16 bytes read from the address held in %[iHeight] */ \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ mov_line_end16x4_mmi_aligned($11, %[iStride], $f4, $f6) \
+ PTR_ADDIU "%[pDst], %[pDst], 0x10 \n\t" /* move all four pointers to the next 16-byte column */ \
+ PTR_ADDIU "$9, $9, 0x10 \n\t" \
+ PTR_ADDIU "%[iHeight], %[iHeight], 0x10 \n\t" \
+ PTR_ADDIU "$11, $11, 0x10 \n\t" \
+ "dnegu %[iStride], %[iStride] \n\t" /* flip the stride sign: the next column is written walking back the way this one came */ \
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" \
+ "bnez %[iWidth], 1b \n\t" \
+ "nop \n\t" \
+ "and $8, 0x0F \n\t" /* $8 = original width modulo 16 */ \
+ "beqz $8, 2f \n\t" /* no remainder: skip the 8-byte tail */ \
+ "nop \n\t" \
+ "gsldxc1 $f0, 0x0(%[pDst], $0) \n\t" /* tail: 8 bytes read from %[pDst] */ \
+ mov_line_8x4_mmi_aligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_aligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_aligned($9, %[iStride], $f0) \
+ mov_line_end8x4_mmi_aligned($9, %[iStride], $f0) \
+ "gsldxc1 $f4, 0x0(%[iHeight], $0) \n\t" /* tail: 8 bytes read from the address held in %[iHeight] */ \
+ mov_line_8x4_mmi_aligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_aligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_aligned($11, %[iStride], $f4) \
+ mov_line_end8x4_mmi_aligned($11, %[iStride], $f4) \
+ "2: \n\t"
+
+#define exp_top_bottom_mmi_16_unaligned /* Asm fragment (labels 1 and 2): same structure as exp_top_bottom_mmi_16_aligned, but every load and store uses left/right (unaligned) instruction pairs. */ \
+ "move $8, %[iWidth] \n\t" /* keep the original width for the remainder test below */ \
+ "dsra %[iWidth], %[iWidth], 0x4 \n\t" /* iWidth becomes the number of 16-byte columns */ \
+ "1: \n\t" \
+ "gsldlc1 $f0, 0x7(%[pDst]) \n\t" /* $f0 <- bytes 0..7, $f2 <- bytes 8..15 at %[pDst] */ \
+ "gsldlc1 $f2, 0xF(%[pDst]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pDst]) \n\t" \
+ "gsldrc1 $f2, 0x8(%[pDst]) \n\t" \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2) /* 3 x 4 rows + 4 rows = 16 rows */ \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ mov_line_end16x4_mmi_unaligned($9, %[iStride], $f0, $f2) \
+ "gsldlc1 $f4, 0x7(%[iHeight]) \n\t" /* same 16-byte load from the address held in %[iHeight] */ \
+ "gsldlc1 $f6, 0xF(%[iHeight]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[iHeight]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[iHeight]) \n\t" \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ mov_line_end16x4_mmi_unaligned($11, %[iStride], $f4, $f6) \
+ PTR_ADDIU "%[pDst], %[pDst], 0x10 \n\t" /* move all four pointers to the next 16-byte column */ \
+ PTR_ADDIU "$9, $9, 0x10 \n\t" \
+ PTR_ADDIU "%[iHeight], %[iHeight], 0x10 \n\t" \
+ PTR_ADDIU "$11, $11, 0x10 \n\t" \
+ "dnegu %[iStride], %[iStride] \n\t" /* flip the stride sign: the next column is written walking back the way this one came */ \
+ PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" \
+ "bnez %[iWidth], 1b \n\t" \
+ "nop \n\t" \
+ "and $8, 0x0F \n\t" /* $8 = original width modulo 16 */ \
+ "beqz $8, 2f \n\t" /* no remainder: skip the 8-byte tail */ \
+ "nop \n\t" \
+ "gsldlc1 $f0, 0x7(%[pDst]) \n\t" /* tail: 8 bytes read from %[pDst] */ \
+ "gsldrc1 $f0, 0x0(%[pDst]) \n\t" \
+ mov_line_8x4_mmi_unaligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_unaligned($9, %[iStride], $f0) \
+ mov_line_8x4_mmi_unaligned($9, %[iStride], $f0) \
+ mov_line_end8x4_mmi_unaligned($9, %[iStride], $f0) \
+ "gsldlc1 $f4, 0x7(%[iHeight]) \n\t" /* tail: 8 bytes read from the address held in %[iHeight] */ \
+ "gsldrc1 $f4, 0x0(%[iHeight]) \n\t" \
+ mov_line_8x4_mmi_unaligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_unaligned($11, %[iStride], $f4) \
+ mov_line_8x4_mmi_unaligned($11, %[iStride], $f4) \
+ mov_line_end8x4_mmi_unaligned($11, %[iStride], $f4) \
+ "2: \n\t"
+
+#define exp_left_right_mmi_16_aligned /* Asm fragment (loop label 3): for $8 rows, replicate the byte at %[pDst] into 16 bytes at $9 and the byte at %[iHeight] into 16 bytes at $11, then step all four pointers by %[iStride]. %[iWidth] is reused as a scratch register. */ \
+ "3: \n\t" \
+ "lbu %[iWidth], 0x0(%[pDst]) \n\t" /* byte at %[pDst] */ \
+ MMI_Copy16Times($f0, $f2, $f28, %[iWidth]) /* defined outside this chunk; presumably broadcasts the byte into $f0/$f2 using $f28 as a zero operand - confirm */ \
+ "gssqc1 $f2, $f0, 0x0($9) \n\t" \
+ "lbu %[iWidth], 0x0(%[iHeight]) \n\t" /* byte at the address held in %[iHeight] */ \
+ MMI_Copy16Times($f4, $f6, $f28, %[iWidth]) \
+ "gssqc1 $f6, $f4, 0x0($11) \n\t" \
+ PTR_ADDU "%[pDst], %[pDst], %[iStride] \n\t" \
+ PTR_ADDU "$9, $9, %[iStride] \n\t" \
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" \
+ PTR_ADDU "$11, $11, %[iStride] \n\t" \
+ PTR_ADDIU "$8, $8, -0x1 \n\t" /* $8 is the remaining-row counter */ \
+ "bnez $8, 3b \n\t" \
+ "nop \n\t"
+
+#define exp_left_right_mmi_16_unaligned /* Asm fragment (loop label 3): same as exp_left_right_mmi_16_aligned, but the 16-byte stores to $9 and $11 use left/right (unaligned) store pairs. */ \
+ "3: \n\t" \
+ "lbu %[iWidth], 0x0(%[pDst]) \n\t" /* byte at %[pDst]; %[iWidth] is reused as scratch */ \
+ MMI_Copy16Times($f0, $f2, $f28, %[iWidth]) /* defined outside this chunk; presumably broadcasts the byte into $f0/$f2 using $f28 as a zero operand - confirm */ \
+ "gssdlc1 $f0, 0x7($9) \n\t" \
+ "gssdlc1 $f2, 0xF($9) \n\t" \
+ "gssdrc1 $f0, 0x0($9) \n\t" \
+ "gssdrc1 $f2, 0x8($9) \n\t" \
+ "lbu %[iWidth], 0x0(%[iHeight]) \n\t" /* byte at the address held in %[iHeight] */ \
+ MMI_Copy16Times($f4, $f6, $f28, %[iWidth]) \
+ "gssdlc1 $f4, 0x7($11) \n\t" \
+ "gssdlc1 $f6, 0xF($11) \n\t" \
+ "gssdrc1 $f4, 0x0($11) \n\t" \
+ "gssdrc1 $f6, 0x8($11) \n\t" \
+ PTR_ADDU "%[pDst], %[pDst], %[iStride] \n\t" \
+ PTR_ADDU "$9, $9, %[iStride] \n\t" \
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" \
+ PTR_ADDU "$11, $11, %[iStride] \n\t" \
+ PTR_ADDIU "$8, $8, -0x1 \n\t" /* $8 is the remaining-row counter */ \
+ "bnez $8, 3b \n\t" \
+ "nop \n\t"
+
+#define exp_cross_mmi_16_aligned /* Asm fragment: fill four 16-row x 16-byte regions with constant register pairs using aligned stores: %[iHeight] <- $f12/$f14, $11 <- $f16/$f18, $9 <- $f20/$f22, $8 <- $f24/$f26. */ \
+ mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) /* region addressed through %[iHeight]: 3 x 4 rows + 4 rows */ \
+ mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_end16x4_mmi_aligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18) /* region addressed through $11 */ \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_end16x4_mmi_aligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22) /* region addressed through $9 */ \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_end16x4_mmi_aligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26) /* region addressed through $8 */ \
+ mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26) \
+ mov_line_16x4_mmi_aligned($8, %[iStride], $f24, $f26) \
+ mov_line_end16x4_mmi_aligned($8, %[iStride], $f24, $f26)
+
+#define exp_cross_mmi_16_unaligned /* Asm fragment: fill four 16-row x 16-byte regions with constant register pairs using unaligned stores: %[iHeight] <- $f12/$f14, $11 <- $f16/$f18, $9 <- $f20/$f22, $8 <- $f24/$f26. */ \
+ mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) /* region addressed through %[iHeight]: 3 x 4 rows + 4 rows */ \
+ mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_end16x4_mmi_unaligned(%[iHeight], %[iStride], $f12, $f14) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18) /* region addressed through $11 */ \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_end16x4_mmi_unaligned($11, %[iStride], $f16, $f18) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22) /* region addressed through $9 */ \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_end16x4_mmi_unaligned($9, %[iStride], $f20, $f22) \
+ mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26) /* region addressed through $8 */ \
+ mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26) \
+ mov_line_16x4_mmi_unaligned($8, %[iStride], $f24, $f26) \
+ mov_line_end16x4_mmi_unaligned($8, %[iStride], $f24, $f26)
+
+void ExpandPictureLuma_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth, /* Pads the iWidth x iHeight picture at pDst (row pitch iStride bytes) with a 32-byte/32-row border replicated from its edge pixels. */
+ int32_t iHeight) { /* Three passes: top/bottom rows, left/right columns, then the four corners. All four operands are modified by the asm. */
+ BACKUP_REG; /* defined outside this chunk; NOTE(review): presumably saves the FP registers clobbered below - confirm in asmdefs_mmi.h */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t" /* $f28 = 0, passed to every MMI_Copy16Times below */
+ "lbu $8, 0x0(%[pDst]) \n\t" /* first byte of the first row */
+
+ MMI_Copy16Times($f12, $f14, $f28, $8) /* macro defined outside this chunk; presumably broadcasts the byte into $f12/$f14 - confirm */
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDU "$9, %[pDst], %[iStride] \n\t" /* $9 = pDst - stride (row above the picture) */
+ "dnegu %[iStride], %[iStride] \n\t" /* stride positive again */
+ "move $10, %[iHeight] \n\t" /* keep the row count; %[iHeight] is reused as an address from here on */
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" /* immediate operand, so use the add-immediate form like the rest of the file */
+ "dmul %[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[pDst] \n\t" /* %[iHeight] = address of the last picture row */
+
+ "move $8, %[iStride] \n\t"
+ "dsll $8, 0x5 \n\t"
+ PTR_ADDU "$11, %[iHeight], $8 \n\t" /* $11 = last row + 32 * stride */
+
+ "lbu $8, 0x0(%[iHeight]) \n\t" /* first byte of the last row */
+ MMI_Copy16Times($f20, $f22, $f28, $8)
+ PTR_ADDU "$8, %[iHeight], %[iWidth] \n\t"
+ PTR_ADDIU "$8, -0x1 \n\t"
+ "lbu $8, 0x0($8) \n\t" /* last byte of the last row */
+ "dmtc1 $8, $f24 \n\t" /* this corner value is broadcast by hand into $f24/$f26 instead of via MMI_Copy16Times */
+ "pshufh $f24, $f24, $f28 \n\t"
+ "packushb $f24, $f24, $f24 \n\t"
+ "mov.d $f26, $f24 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t" /* stride negative on entry to the top/bottom pass */
+ "move $12, %[pDst] \n\t" /* save pDst / stride / width: the pass below modifies them */
+ "move $13, %[iStride] \n\t"
+ "move $14, %[iWidth] \n\t"
+ exp_top_bottom_mmi_32
+ "move %[iWidth], $14 \n\t"
+ "move %[iStride], $13 \n\t"
+ "move %[pDst], $12 \n\t"
+ PTR_ADDIU "$9, %[pDst], -0x20 \n\t" /* $9 = 32 bytes left of the first row */
+ PTR_ADDU "%[iHeight], %[pDst], %[iWidth] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" /* %[iHeight] = address of the last byte of the first row */
+ PTR_ADDIU "$11, %[iHeight], 0x1 \n\t" /* $11 = first byte right of the first row */
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+ MMI_Copy16Times($f16, $f18, $f28, $8)
+ "dnegu %[iStride], %[iStride] \n\t" /* stride positive for the left/right pass */
+ "move $8, $10 \n\t" /* $8 = row count saved earlier */
+ "move $10, %[pDst] \n\t" /* save pDst / stride / width / row count across the pass */
+ "move $12, %[iStride] \n\t"
+ "move $13, %[iWidth] \n\t"
+ "move $14, $8 \n\t"
+
+ exp_left_right_mmi_32
+
+ "move $8, $14 \n\t"
+ "move %[iWidth], $13 \n\t"
+ "move %[iStride], $12 \n\t"
+ "move %[pDst], $10 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[pDst], -0x20 \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" /* %[iHeight] = pDst - 32 - stride */
+ PTR_ADDU "$11, %[pDst], %[iWidth] \n\t"
+ PTR_ADDU "$11, $11, %[iStride] \n\t" /* $11 = pDst + width - stride */
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "dmul $8, $8, %[iStride] \n\t" /* $8 = (rows + 32) * stride */
+ PTR_ADDU "$9, %[iHeight], $8 \n\t" /* $9 and $8 = the two pointers above moved down by that amount */
+ PTR_ADDU "$8, $11, $8 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t" /* the corner pass runs with a negative stride */
+ exp_cross_mmi_32
+ : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) /* NOTE(review): %[iHeight] is declared int but holds addresses inside the asm */
+ :
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG; defined outside this chunk */
+}
+
+void ExpandPictureChromaUnalign_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth, /* Pads the iWidth x iHeight picture at pDst (row pitch iStride bytes) with a 16-byte/16-row border replicated from its edge pixels, using unaligned loads and stores. */
+ int32_t iHeight) { /* Three passes: top/bottom rows, left/right columns, then the four corners. All four operands are modified by the asm. */
+ BACKUP_REG; /* defined outside this chunk; NOTE(review): presumably saves the FP registers clobbered below - confirm in asmdefs_mmi.h */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t" /* $f28 = 0, passed to every MMI_Copy16Times below */
+ "lbu $8, 0x0(%[pDst]) \n\t" /* first byte of the first row */
+
+ MMI_Copy16Times($f12, $f14, $f28, $8) /* macro defined outside this chunk; presumably broadcasts the byte into $f12/$f14 - confirm */
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDU "$9, %[pDst], %[iStride] \n\t" /* $9 = pDst - stride (row above the picture) */
+ "dnegu %[iStride], %[iStride] \n\t" /* stride positive again */
+ "move $10, %[iHeight] \n\t" /* keep the row count; %[iHeight] is reused as an address from here on */
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" /* immediate operand, so use the add-immediate form like the rest of the file */
+ "dmul %[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[pDst] \n\t" /* %[iHeight] = address of the last picture row */
+ "move $8, %[iStride] \n\t"
+ "dsll $8, 0x4 \n\t"
+ PTR_ADDU "$11, %[iHeight], $8 \n\t" /* $11 = last row + 16 * stride */
+ "lbu $8, 0x0(%[iHeight]) \n\t" /* first byte of the last row */
+
+ MMI_Copy16Times($f20, $f22, $f28, $8)
+
+ PTR_ADDU "$8, %[iHeight], %[iWidth] \n\t"
+ PTR_ADDIU "$8, -0x1 \n\t"
+ "lbu $8, 0x0($8) \n\t" /* last byte of the last row */
+
+ MMI_Copy16Times($f24, $f26, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t" /* stride negative on entry to the top/bottom pass */
+ "move $12, %[pDst] \n\t" /* save pDst / stride / width: the pass below modifies them */
+ "move $13, %[iStride] \n\t"
+ "move $14, %[iWidth] \n\t"
+
+ exp_top_bottom_mmi_16_unaligned
+
+ "move %[iWidth], $14 \n\t"
+ "move %[iStride], $13 \n\t"
+ "move %[pDst], $12 \n\t"
+ PTR_ADDIU "$9, %[pDst], -0x10 \n\t" /* $9 = 16 bytes left of the first row */
+ PTR_ADDU "%[iHeight], %[pDst], %[iWidth] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" /* %[iHeight] = address of the last byte of the first row */
+ PTR_ADDIU "$11, %[iHeight], 0x1 \n\t" /* $11 = first byte right of the first row */
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+ MMI_Copy16Times($f16, $f18, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t" /* stride positive for the left/right pass */
+ "move $8, $10 \n\t" /* $8 = row count saved earlier */
+
+ "move $10, %[pDst] \n\t" /* save pDst / stride / width / row count across the pass */
+ "move $12, %[iStride] \n\t"
+ "move $13, %[iWidth] \n\t"
+ "move $14, $8 \n\t"
+
+ exp_left_right_mmi_16_unaligned
+
+ "move $8, $14 \n\t"
+ "move %[iWidth], $13 \n\t"
+ "move %[iStride], $12 \n\t"
+ "move %[pDst], $10 \n\t"
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[pDst], -0x10 \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" /* %[iHeight] = pDst - 16 - stride */
+ PTR_ADDU "$11, %[pDst], %[iWidth] \n\t"
+ PTR_ADDU "$11, $11, %[iStride] \n\t" /* $11 = pDst + width - stride */
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "dmul $8, $8, %[iStride] \n\t" /* $8 = (rows + 16) * stride */
+
+ PTR_ADDU "$9, %[iHeight], $8 \n\t" /* $9 and $8 = the two pointers above moved down by that amount */
+ PTR_ADDU "$8, $11, $8 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t" /* the corner pass runs with a negative stride */
+
+ exp_cross_mmi_16_unaligned
+ : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) /* NOTE(review): %[iHeight] is declared int but holds addresses inside the asm */
+ :
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG; defined outside this chunk */
+}
+
+void ExpandPictureChromaAlign_mmi(uint8_t *pDst, int32_t iStride, int32_t iWidth, /* Pads the iWidth x iHeight picture at pDst (row pitch iStride bytes) with a 16-byte/16-row border replicated from its edge pixels, using aligned loads and stores. */
+ int32_t iHeight) { /* Three passes: top/bottom rows, left/right columns, then the four corners. All four operands are modified by the asm. */
+ BACKUP_REG; /* defined outside this chunk; NOTE(review): presumably saves the FP registers clobbered below - confirm in asmdefs_mmi.h */
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f28, $f28, $f28 \n\t" /* $f28 = 0, passed to every MMI_Copy16Times below */
+ "lbu $8, 0x0(%[pDst]) \n\t" /* first byte of the first row */
+
+ MMI_Copy16Times($f12, $f14, $f28, $8) /* macro defined outside this chunk; presumably broadcasts the byte into $f12/$f14 - confirm */
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDU "$9, %[pDst], %[iStride] \n\t" /* $9 = pDst - stride (row above the picture) */
+ "dnegu %[iStride], %[iStride] \n\t" /* stride positive again */
+ "move $10, %[iHeight] \n\t" /* keep the row count; %[iHeight] is reused as an address from here on */
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" /* immediate operand, so use the add-immediate form like the rest of the file */
+ "dmul %[iHeight], %[iHeight], %[iStride] \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[pDst] \n\t" /* %[iHeight] = address of the last picture row */
+ "move $8, %[iStride] \n\t"
+ "dsll $8, 0x4 \n\t"
+ PTR_ADDU "$11, %[iHeight], $8 \n\t" /* $11 = last row + 16 * stride */
+ "lbu $8, 0x0(%[iHeight]) \n\t" /* first byte of the last row */
+
+ MMI_Copy16Times($f20, $f22, $f28, $8)
+
+ PTR_ADDU "$8, %[iHeight], %[iWidth] \n\t"
+ PTR_ADDIU "$8, -0x1 \n\t"
+ "lbu $8, 0x0($8) \n\t" /* last byte of the last row */
+
+ MMI_Copy16Times($f24, $f26, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t" /* stride negative on entry to the top/bottom pass */
+
+ "move $12, %[pDst] \n\t" /* save pDst / stride / width: the pass below modifies them */
+ "move $13, %[iStride] \n\t"
+ "move $14, %[iWidth] \n\t"
+ exp_top_bottom_mmi_16_aligned
+
+ "move %[iWidth], $14 \n\t"
+ "move %[iStride], $13 \n\t"
+ "move %[pDst], $12 \n\t"
+
+ PTR_ADDIU "$9, %[pDst], -0x10 \n\t" /* $9 = 16 bytes left of the first row */
+
+ PTR_ADDU "%[iHeight], %[pDst], %[iWidth] \n\t"
+ PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" /* %[iHeight] = address of the last byte of the first row */
+ PTR_ADDIU "$11, %[iHeight], 0x1 \n\t" /* $11 = first byte right of the first row */
+
+ "lbu $8, 0x0(%[iHeight]) \n\t"
+
+ MMI_Copy16Times($f16, $f18, $f28, $8)
+
+ "dnegu %[iStride], %[iStride] \n\t" /* stride positive for the left/right pass */
+ "move $8, $10 \n\t" /* $8 = row count saved earlier */
+
+ "move $10, %[pDst] \n\t" /* save pDst / stride / width / row count across the pass */
+ "move $12, %[iStride] \n\t"
+ "move $13, %[iWidth] \n\t"
+ "move $14, $8 \n\t"
+
+ exp_left_right_mmi_16_aligned
+
+ "move $8, $14 \n\t"
+ "move %[iWidth], $13 \n\t"
+ "move %[iStride], $12 \n\t"
+ "move %[pDst], $10 \n\t"
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "%[iHeight], %[pDst], -0x10 \n\t"
+ PTR_ADDU "%[iHeight], %[iHeight], %[iStride] \n\t" /* %[iHeight] = pDst - 16 - stride */
+ PTR_ADDU "$11, %[pDst], %[iWidth] \n\t"
+ PTR_ADDU "$11, $11, %[iStride] \n\t" /* $11 = pDst + width - stride */
+
+ "dnegu %[iStride], %[iStride] \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "dmul $8, $8, %[iStride] \n\t" /* $8 = (rows + 16) * stride */
+
+ PTR_ADDU "$9, %[iHeight], $8 \n\t" /* $9 and $8 = the two pointers above moved down by that amount */
+ PTR_ADDU "$8, $11, $8 \n\t"
+ "dnegu %[iStride], %[iStride] \n\t" /* the corner pass runs with a negative stride */
+
+ exp_cross_mmi_16_aligned
+ : [pDst]"+&r"((unsigned char *)pDst), [iStride]"+&r"((int)iStride),
+ [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) /* NOTE(review): %[iHeight] is declared int but holds addresses inside the asm */
+ :
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2",
+ "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20",
+ "$f22", "$f24", "$f26", "$f28"
+ );
+ RECOVER_REG; /* counterpart of BACKUP_REG; defined outside this chunk */
+}
--- /dev/null
+++ b/codec/common/mips/intra_pred_com_mmi.c
@@ -1,0 +1,548 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file intra_pred_com_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+/*
+ * One row of 16x16 horizontal prediction: advances pPred by 0x10 and pRef by
+ * kiStride, loads the byte at pRef[0] into $8, passes it through
+ * MMI_Copy16Times into the $f0/$f2 pair and stores that pair at pPred[0].
+ * Scratch: $8, $f0, $f2. $f4 is handed to MMI_Copy16Times as its third
+ * argument and is not written here.
+ * NOTE(review): MMI_Copy16Times comes from asmdefs_mmi.h; presumably it
+ * broadcasts the byte in $8 to all 16 byte lanes and needs $f4 == 0 - confirm.
+ */
+#define MMI_PRED_H_16X16_ONE_LINE \
+ PTR_ADDIU "%[pPred], %[pPred], 0x10 \n\t" \
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
+ "lbu $8, 0x0(%[pRef]) \n\t" \
+ MMI_Copy16Times($f0, $f2, $f4, $8) \
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+/*
+ * Adds two more left-neighbour bytes to the running sum in $8: twice in a row
+ * it advances pRef by kiStride, loads the byte at pRef[-1] into $9 and adds
+ * $9 to $8. On exit pRef has moved down by two rows. Scratch: $9.
+ */
+#define LOAD_2_LEFT_AND_ADD \
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
+ "lbu $9, -0x1(%[pRef]) \n\t" \
+ PTR_ADDU "$8, $8, $9 \n\t" \
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t" \
+ "lbu $9, -0x1(%[pRef]) \n\t" \
+ PTR_ADDU "$8, $8, $9 \n\t"
+
+// One row of 8x8 horizontal prediction.
+// Advances r0 by kiStride, loads the 8 bytes ending just before r0
+// (r0 - 8 .. r0 - 1) into f0, shifts f0 right by the bit count in f4,
+// multiplies each halfword by f2, shuffles halfwords with selector f6 and
+// stores the 8-byte result at r1 + r1_offset.
+// Register contract expected from the caller:
+// f2 should be mmi_01bytes, f4 should be 0x38, f6 should be 0x0
+// (with those values the byte at r0[-1] ends up replicated in all 8 lanes).
+#define MMI_PRED_H_8X8_ONE_LINE(f0, f2, f4, f6, r0, r1, r1_offset) \
+ PTR_ADDU ""#r0", "#r0", %[kiStride] \n\t" \
+ "gsldxc1 "#f0", -0x8("#r0", $0) \n\t" \
+ "dsrl "#f0", "#f0", "#f4" \n\t" \
+ "pmullh "#f0", "#f0", "#f2" \n\t" \
+ "pshufh "#f0", "#f0", "#f6" \n\t" \
+ "gssdxc1 "#f0", "#r1_offset"+0x0("#r1", $0) \n\t"
+
+/*
+ * 16x16 luma vertical intra prediction: copies the 16 bytes located one
+ * stride above pRef into each of the 16 rows of pPred (rows are 0x10 bytes
+ * apart in pPred).
+ *   pPred    - destination block, 16 rows of 16 bytes
+ *   pRef     - reference pointer; the row at pRef - kiStride is read
+ *   kiStride - byte stride of the reference picture
+ * NOTE(review): gslqc1/gssqc1 are quad-word accesses; presumably both
+ * pRef - kiStride and pPred must be 16-byte aligned - confirm.
+ */
+void WelsI16x16LumaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Step back to the row above the block and load its 16 bytes. */
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pRef]) \n\t"
+
+ /* Replicate that row into all 16 destination rows. */
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x80(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x90(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xa0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xb0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xc0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xd0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xe0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xf0(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$f0", "$f2"
+ );
+}
+
+/*
+ * 16x16 luma horizontal intra prediction: for each of the 16 rows, takes the
+ * byte immediately left of the block (pRef[-1] of that row), expands it with
+ * MMI_Copy16Times and stores the resulting 16 bytes to the matching row of
+ * pPred (rows are 0x10 bytes apart in pPred).
+ *   pPred    - destination block, 16 rows of 16 bytes
+ *   pRef     - top-left sample of the block in the reference picture
+ *   kiStride - byte stride of the reference picture
+ * NOTE(review): MMI_Copy16Times is defined in asmdefs_mmi.h; presumably it
+ * broadcasts the byte in $8 across $f0/$f2 using $f4 as a zero selector.
+ */
+void WelsI16x16LumaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Row 0: point pRef at the left neighbour column and handle it inline. */
+ PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
+ "lbu $8, 0x0(%[pRef]) \n\t"
+ /* $f4 stays zero for every MMI_Copy16Times invocation below. */
+ "xor $f4, $f4, $f4 \n\t"
+ MMI_Copy16Times($f0, $f2, $f4, $8)
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+ /* Rows 1..15: each macro call advances pPred and pRef by one row. */
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ MMI_PRED_H_16X16_ONE_LINE
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$8", "$f0", "$f2", "$f4"
+ );
+}
+
+/*
+ * 16x16 luma DC intra prediction: fills all 16 rows of pPred (0x10 bytes
+ * apart) with (sum of the 16 bytes above + sum of the 16 bytes to the left
+ * + 16) >> 5.
+ *   pPred    - destination block, 16 rows of 16 bytes
+ *   pRef     - top-left sample of the block in the reference picture
+ *   kiStride - byte stride of the reference picture
+ * Scratch GPRs $8, $9 and $10 are all written by the asm body and therefore
+ * all appear in the clobber list.
+ * NOTE(review): gslqc1/gssqc1 are quad-word accesses; presumably
+ * pRef - kiStride and pPred must be 16-byte aligned - confirm.
+ */
+void WelsI16x16LumaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Sum the 16 bytes of the row above: |x - 0| per byte, then byte-add. */
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pRef]) \n\t"
+ "xor $f4, $f4, $f4 \n\t"
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f4 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+
+ /* Sum the left column into $8: rows 0 and 1 here, 14 more rows below. */
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+
+ /* dc = (top + left + 0x10) >> 5; $f6 holds the shift count. */
+ "dli $10, 0x5 \n\t"
+ "dmtc1 $10, $f6 \n\t"
+ PTR_ADDIU "$8, 0x10 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "psrlw $f0, $f0, $f6 \n\t"
+ /* Multiply by 0x01010101 to copy dc into 4 bytes, then widen to 16. */
+ "gsldxc1 $f6, 0x0(%[mmi_01bytes], $0) \n\t"
+ "pmuluw $f0, $f0, $f6 \n\t"
+ "punpcklwd $f0, $f0, $f0 \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x40(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x50(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x60(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x70(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x80(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x90(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xa0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xb0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xc0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xd0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xe0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0xf0(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+ /* $9 and $10 are written above, so they must be declared as clobbered. */
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6"
+ );
+}
+
+/*
+ * 16x16 luma plane intra prediction.
+ * Derives a horizontal gradient b from the row above the block, a vertical
+ * gradient c from the left column and a base value, then writes 16 rows of
+ * 16 bytes to pPred, each byte being
+ * (base + b * {-7..8} + row * c) >> 5 packed with unsigned saturation.
+ *   pPred    - destination block, 16 rows of 16 bytes
+ *   pRef     - top-left sample of the block in the reference picture
+ *   kiStride - byte stride of the reference picture
+ * BACKUP_REG / RECOVER_REG, SUMH_HORIZON, MMI_Copy8Times and LOAD_COLUMN are
+ * defined outside this file section (asmdefs_mmi.h); comments about them
+ * below are hedged accordingly.
+ */
+void WelsI16x16LumaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ short mmi_plane_inc_minus[8]__attribute__((aligned(16))) = {-7, -6, -5, -4,
+ -3, -2, -1, 0};
+ short mmi_plane_inc[8]__attribute__((aligned(16))) = {1, 2, 3, 4, 5, 6, 7, 8};
+ short mmi_plane_dec[8]__attribute__((aligned(16))) = {8, 7, 6, 5, 4, 3, 2, 1};
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* pRef -> the sample above-left of the block. */
+ PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+
+ /* Bytes 0..7 of that row, widened to halfwords and weighted by 8..1. */
+ "gsldlc1 $f0, 0x7(%[pRef]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pRef]) \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_dec]) \n\t"
+ "punpckhbh $f2, $f0, $f28 \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ /* Bytes 9..16 of that row, weighted by 1..8; then subtract the first half. */
+ "gsldlc1 $f4, 0x10(%[pRef]) \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "pmullh $f2, $f2, $f22 \n\t"
+ "gsldrc1 $f4, 0x9(%[pRef]) \n\t"
+ "gslqc1 $f26, $f24, 0x0(%[mmi_plane_inc]) \n\t"
+ "punpckhbh $f6, $f4, $f28 \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "pmullh $f6, $f6, $f26 \n\t"
+ "psubh $f4, $f4, $f0 \n\t"
+ "psubh $f6, $f6, $f2 \n\t"
+
+ /* b = (5 * H + 32) >> 6, where H is the low halfword left in $f4. */
+ /* NOTE(review): SUMH_HORIZON presumably sums the halfword lanes - confirm. */
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+ "dmfc1 $8, $f4 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x5 \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "sra $8, $8, 0x6 \n\t"
+ /* NOTE(review): MMI_Copy8Times presumably broadcasts b into $f4/$f6. */
+ MMI_Copy8Times($f4, $f6, $f28, $8)
+
+ /* $9 = last byte of the row above (offset 0x10 from the above-left). */
+ "lbu $9, 0x10(%[pRef]) \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ /* NOTE(review): LOAD_COLUMN presumably gathers left-column bytes and */
+ /* advances pRef; its definition is not visible here - confirm. */
+ LOAD_COLUMN($f0, $f2, $f8, $f10, $f12, $f14, $f16,
+ $f18, %[pRef], %[kiStride], $11)
+
+ /* $9 = ($9 + byte at pRef + 8 * kiStride) << 4. */
+ PTR_ADDIU "%[pRef], %[pRef], 0x3 \n\t"
+ "dsll $10, %[kiStride], 0x3 \n\t"
+ PTR_ADDU "$10, $10, %[pRef] \n\t"
+ "lbu $8, 0x0($10) \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ "dsll $9, $9, 0x4 \n\t"
+
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ LOAD_COLUMN($f28, $f30, $f8, $f10, $f12, $f14, $f16,
+ $f18, %[pRef], %[kiStride], $11)
+ /* Same weighting as for the top row, applied to the two column loads. */
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "punpcklbh $f0, $f2, $f18 \n\t"
+ "punpckhbh $f2, $f2, $f18 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "pmullh $f2, $f2, $f22 \n\t"
+ "punpcklbh $f28, $f30, $f18 \n\t"
+ "punpckhbh $f30, $f30, $f18 \n\t"
+ "pmullh $f28, $f28, $f24 \n\t"
+ "pmullh $f30, $f30, $f26 \n\t"
+ "psubh $f28, $f28, $f0 \n\t"
+ "psubh $f30, $f30, $f2 \n\t"
+
+ /* c = (5 * V + 32) >> 6, handed to MMI_Copy8Times for $f16/$f18. */
+ SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+ "dmfc1 $8, $f28 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x5 \n\t"
+ PTR_ADDIU "$8, $8, 0x20 \n\t"
+ "sra $8, $8, 0x6 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ MMI_Copy8Times($f16, $f18, $f20, $8)
+
+ /* Row-0 base value = $9 + 16 - 7 * c, handed to MMI_Copy8Times for $f0/$f2. */
+ PTR_ADDIU "$9, $9, 0x10 \n\t"
+ "mul $8, $8, -0x7 \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ MMI_Copy8Times($f0, $f2, $f20, $8)
+
+ /* $8 becomes the row counter; $f20/$f22 = {-7..0}, $f24/$f26 = {1..8}. */
+ "xor $8, $8, $8 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_inc_minus]) \n\t"
+
+ /* $f30 = shift count 5 used by every psrah in the loop. */
+ "dli $10, 0x5 \n\t"
+ "dmtc1 $10, $f30 \n\t"
+ /* This store is overwritten by the first loop iteration. */
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "1: \n\t"
+ /* Left 8 pixels: (b * {-7..0} + base) >> 5. */
+ "pmullh $f8, $f4, $f20 \n\t"
+ "pmullh $f10, $f6, $f22 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "psrah $f8, $f8, $f30 \n\t"
+ "psrah $f10, $f10, $f30 \n\t"
+ /* Right 8 pixels: (b * {1..8} + base) >> 5. */
+ "pmullh $f12, $f4, $f24 \n\t"
+ "pmullh $f14, $f6, $f26 \n\t"
+ "paddh $f12, $f12, $f0 \n\t"
+ "paddh $f14, $f14, $f2 \n\t"
+ "psrah $f12, $f12, $f30 \n\t"
+ "psrah $f14, $f14, $f30 \n\t"
+ /* Saturate to bytes and store the 16-byte row. */
+ "packushb $f8, $f8, $f10 \n\t"
+ "packushb $f10, $f12, $f14 \n\t"
+ "gssqc1 $f10, $f8, 0x0(%[pPred]) \n\t"
+ /* base += c for the next row. */
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], 0x10 \n\t"
+ PTR_ADDIU "$8, $8, 0x1 \n\t"
+ /* Loop until 16 rows are done; the nop is emitted right after the branch. */
+ PTR_ADDIU "$10, $8, -0x10 \n\t"
+ "bnez $10, 1b \n\t"
+ "nop \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_plane_inc_minus]"r"(mmi_plane_inc_minus),
+ [mmi_plane_inc]"r"(mmi_plane_inc), [mmi_plane_dec]"r"(mmi_plane_dec)
+ : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8",
+ "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26",
+ "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*
+ * 8x8 chroma plane intra prediction.
+ * Derives a horizontal gradient b from the row above the block, a vertical
+ * gradient c from the left column and a base value, then writes 8 rows of
+ * 8 bytes to pPred, each byte being
+ * (base + b * {-3..4} + row * c) >> 5 packed with unsigned saturation.
+ *   pPred    - destination block, 8 rows of 8 bytes
+ *   pRef     - top-left sample of the block in the reference picture
+ *   kiStride - byte stride of the reference picture
+ * BACKUP_REG / RECOVER_REG, SUMH_HORIZON, MMI_Copy8Times and LOAD_COLUMN_C
+ * are defined outside this file section (asmdefs_mmi.h); comments about them
+ * below are hedged accordingly.
+ */
+void WelsIChromaPredPlane_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ short mmi_plane_inc_c[4]__attribute__((aligned(16))) = {1, 2, 3, 4};
+ short mmi_plane_dec_c[4]__attribute__((aligned(16))) = {4, 3, 2, 1};
+ short mmi_plane_mul_b_c[8]__attribute__((aligned(16))) = {-3, -2, -1, 0,
+ 1, 2, 3, 4};
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* pRef -> the sample above-left of the block. */
+ PTR_ADDIU "%[pRef], %[pRef], -0x1 \n\t"
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+
+ /* Low 4 bytes of the row above weighted by 4..1; bytes 5..8 by 1..4. */
+ "gsldlc1 $f0, 0x7(%[pRef]) \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "gsldrc1 $f0, 0x0(%[pRef]) \n\t"
+ "gsldxc1 $f20, 0x0(%[mmi_plane_dec_c], $0) \n\t"
+ "punpcklbh $f0, $f0, $f28 \n\t"
+ "gsldlc1 $f4, 0xc(%[pRef]) \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "gsldrc1 $f4, 0x5(%[pRef]) \n\t"
+ "gsldxc1 $f24, 0x0(%[mmi_plane_inc_c], $0) \n\t"
+ "punpcklbh $f4, $f4, $f28 \n\t"
+ "pmullh $f4, $f4, $f24 \n\t"
+ "psubh $f4, $f4, $f0 \n\t"
+
+ /* b = (17 * H + 16) >> 5, where H is the low halfword left in $f4. */
+ /* NOTE(review): SUMH_HORIZON presumably sums the halfword lanes - confirm. */
+ "xor $f6, $f6, $f6 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f4, $f6, $f0, $f2, $f8)
+ "dmfc1 $8, $f4 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x11 \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "sra $8, $8, 0x5 \n\t"
+ /* NOTE(review): MMI_Copy8Times presumably broadcasts b into $f4/$f6. */
+ MMI_Copy8Times($f4, $f6, $f28, $8)
+
+ /* $8 = last byte of the row above (offset 0x8 from the above-left). */
+ "lbu $8, 0x8(%[pRef]) \n\t"
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ /* NOTE(review): LOAD_COLUMN_C presumably gathers left-column bytes and */
+ /* advances pRef; its definition is not visible here - confirm. */
+ LOAD_COLUMN_C($f0, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
+
+ /* $9 = (byte at pRef + 4 * kiStride + $8) << 4. */
+ PTR_ADDIU "%[pRef], %[pRef], 0x3 \n\t"
+ "dsll $10, %[kiStride], 0x2 \n\t"
+ PTR_ADDU "$10, $10, %[pRef] \n\t"
+ "lbu $9, 0x0($10) \n\t"
+ PTR_ADDU "$9, $9, $8 \n\t"
+ "dsll $9, $9, 0x4 \n\t"
+
+ PTR_ADDIU "%[pRef], %[pRef], -0x3 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ LOAD_COLUMN_C($f28, $f8, $f12, $f16, %[pRef], %[kiStride], $10)
+ /* Same weighting as for the top row, applied to the two column loads. */
+ "xor $f16, $f16, $f16 \n\t"
+ "punpckhbh $f0, $f0, $f16 \n\t"
+ "pmullh $f0, $f0, $f20 \n\t"
+ "punpckhbh $f28, $f28, $f16 \n\t"
+ "pmullh $f28, $f28, $f24 \n\t"
+ "psubh $f28, $f28, $f0 \n\t"
+
+ /* c = (17 * V + 16) >> 5, handed to MMI_Copy8Times for $f16/$f18. */
+ "xor $f30, $f30, $f30 \n\t"
+ "xor $f8, $f8, $f8 \n\t"
+ SUMH_HORIZON($f28, $f30, $f0, $f2, $f8)
+ "dmfc1 $8, $f28 \n\t"
+ "seh $8, $8 \n\t"
+ "mul $8, $8, 0x11 \n\t"
+ PTR_ADDIU "$8, $8, 0x10 \n\t"
+ "sra $8, $8, 0x5 \n\t"
+ MMI_Copy8Times($f16, $f18, $f8, $8)
+
+ /* Row-0 base value = $9 + 16 - 3 * c, handed to MMI_Copy8Times for $f0/$f2. */
+ PTR_ADDIU "$9, $9, 0x10 \n\t"
+ "mul $8, $8, -0x3 \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ MMI_Copy8Times($f0, $f2, $f8, $8)
+
+ /* $8 becomes the row counter; $f20/$f22 = {-3..0}/{1..4}. */
+ "xor $8, $8, $8 \n\t"
+ "gslqc1 $f22, $f20, 0x0(%[mmi_plane_mul_b_c]) \n\t"
+
+ /* $f30 = shift count 5 used by every psrah in the loop. */
+ "dli $10, 0x5 \n\t"
+ "dmtc1 $10, $f30 \n\t"
+ /* This store's first 8 bytes are overwritten by the first loop iteration. */
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+
+ "1: \n\t"
+ /* One 8-byte row: (b * {-3..4} + base) >> 5, saturated to bytes. */
+ "pmullh $f8, $f4, $f20 \n\t"
+ "pmullh $f10, $f6, $f22 \n\t"
+ "paddh $f8, $f8, $f0 \n\t"
+ "paddh $f10, $f10, $f2 \n\t"
+ "psrah $f8, $f8, $f30 \n\t"
+ "psrah $f10, $f10, $f30 \n\t"
+ "packushb $f8, $f8, $f10 \n\t"
+ "gssdxc1 $f8, 0x0(%[pPred], $0) \n\t"
+ /* base += c for the next row. */
+ "paddh $f0, $f0, $f16 \n\t"
+ "paddh $f2, $f2, $f18 \n\t"
+ PTR_ADDIU "%[pPred], %[pPred], 0x8 \n\t"
+ PTR_ADDIU "$8, $8, 0x1 \n\t"
+ /* Loop until 8 rows are done; the nop is emitted right after the branch. */
+ PTR_ADDIU "$10, $8, -0x8 \n\t"
+ "bnez $10, 1b \n\t"
+ "nop \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_plane_mul_b_c]"r"(mmi_plane_mul_b_c),
+ [mmi_plane_inc_c]"r"(mmi_plane_inc_c), [mmi_plane_dec_c]"r"(mmi_plane_dec_c)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+/*
+ * 8x8 chroma vertical intra prediction: loads the 8 bytes one stride above
+ * pRef, duplicates them into a register pair and writes that pair four times,
+ * filling 0x40 bytes of pPred (eight 8-byte rows).
+ *   pPred    - destination block, 64 contiguous bytes
+ *   pRef     - reference pointer; the row at pRef - kiStride is read
+ *   kiStride - byte stride of the reference picture
+ * NOTE(review): gssqc1 is a quad-word store; presumably pPred must be
+ * 16-byte aligned - confirm.
+ */
+void WelsIChromaPredV_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* $f0 = row above the block; $f2 = copy, so each store covers two rows. */
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gsldxc1 $f0, 0x0(%[pRef], $0) \n\t"
+ "mov.d $f2, $f0 \n\t"
+
+ "gssqc1 $f2, $f0, 0x0(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x10(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x20(%[pPred]) \n\t"
+ "gssqc1 $f2, $f0, 0x30(%[pPred]) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride)
+ : "memory", "$f0", "$f2"
+ );
+}
+
+/*
+ * 8x8 chroma DC intra prediction. Four sums are formed - the low and high
+ * 4 bytes of the row above, and the upper and lower 4 bytes of the left
+ * column - and each 4x4 quadrant of pPred is filled with a rounded average:
+ *   top-left     : (top_low + left_upper + 4) >> 3
+ *   top-right    : (top_high + 2) >> 2
+ *   bottom-left  : (left_lower + 2) >> 2
+ *   bottom-right : (top_high + left_lower + 4) >> 3
+ *   pPred    - destination block, 8 rows of 8 bytes
+ *   pRef     - top-left sample of the block in the reference picture
+ *   kiStride - byte stride of the reference picture
+ */
+void WelsIChromaPredDc_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ short mmi_0x02[4]__attribute__((aligned(16))) = {2, 0, 0, 0};
+ unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* $f0 = the 8 bytes of the row above the block. */
+ PTR_SUBU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "gsldxc1 $f0, 0x0(%[pRef], $0) \n\t"
+
+ /* $f2 = sum of the left-neighbour bytes of rows 0..3. */
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "dmtc1 $8, $f2 \n\t"
+
+ /* $f4 = sum of the left-neighbour bytes of rows 4..7. */
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $8, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ PTR_ADDU "%[pRef], %[pRef], %[kiStride] \n\t"
+ "lbu $9, -0x1(%[pRef]) \n\t"
+ PTR_ADDU "$8, $8, $9 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+
+ /* $f6 = sum of top bytes 0..3, $f0 = sum of top bytes 4..7. */
+ "xor $f8, $f8, $f8 \n\t"
+ "punpcklwd $f6, $f0, $f8 \n\t"
+ "punpckhwd $f0, $f0, $f8 \n\t"
+ "pasubub $f0, $f0, $f8 \n\t"
+ "pasubub $f6, $f6, $f8 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f6, $f6 \n\t"
+
+ /* $f6 = top_low + left_upper, $f2 = left_lower + top_high. */
+ "dadd $f6, $f6, $f2 \n\t"
+ "dadd $f2, $f4, $f0 \n\t"
+
+ /* $f8 = rounding constant 2. */
+ "gsldxc1 $f8, 0x0(%[mmi_0x02], $0) \n\t"
+
+ /* Single-edge quadrants: (sum + 2) >> 2. */
+ "dli $10, 0x2 \n\t"
+ "dmtc1 $10, $f10 \n\t"
+ "dadd $f0, $f0, $f8 \n\t"
+ "dsrl $f0, $f0, $f10 \n\t"
+
+ "dadd $f4, $f4, $f8 \n\t"
+ "dsrl $f4, $f4, $f10 \n\t"
+
+ /* Two-edge quadrants: (sum + 2 + 2) >> 3. */
+ "dli $10, 0x3 \n\t"
+ "dmtc1 $10, $f10 \n\t"
+ "dadd $f6, $f6, $f8 \n\t"
+ "dadd $f6, $f6, $f8 \n\t"
+ "dsrl $f6, $f6, $f10 \n\t"
+
+ "dadd $f2, $f2, $f8 \n\t"
+ "dadd $f2, $f2, $f8 \n\t"
+ "dsrl $f2, $f2, $f10 \n\t"
+
+ /* Replicate each DC over 4 bytes (x 0x01010101) and pair them per row: */
+ /* $f0 = top-right:top-left, $f2 = bottom-right:bottom-left. */
+ "dli $10, 0x20 \n\t"
+ "dmtc1 $10, $f10 \n\t"
+ "gsldxc1 $f12, 0x0(%[mmi_01bytes], $0) \n\t"
+ "pmuluw $f0, $f0, $f12 \n\t"
+ "pmuluw $f6, $f6, $f12 \n\t"
+ "dsll $f0, $f0, $f10 \n\t"
+ "xor $f0, $f0, $f6 \n\t"
+
+ "pmuluw $f4, $f4, $f12 \n\t"
+ "pmuluw $f2, $f2, $f12 \n\t"
+ "dsll $f2, $f2, $f10 \n\t"
+ "xor $f2, $f2, $f4 \n\t"
+
+ /* Rows 0..3 take $f0, rows 4..7 take $f2. */
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+ "gssdxc1 $f0, 0x8(%[pPred], $0) \n\t"
+ "gssdxc1 $f0, 0x10(%[pPred], $0) \n\t"
+ "gssdxc1 $f0, 0x18(%[pPred], $0) \n\t"
+
+ "gssdxc1 $f2, 0x20(%[pPred], $0) \n\t"
+ "gssdxc1 $f2, 0x28(%[pPred], $0) \n\t"
+ "gssdxc1 $f2, 0x30(%[pPred], $0) \n\t"
+ "gssdxc1 $f2, 0x38(%[pPred], $0) \n\t"
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes),
+ [mmi_0x02]"r"((unsigned char *)mmi_0x02)
+ : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12"
+ );
+}
+
+/*
+ * 8x8 chroma horizontal intra prediction: for each of the 8 rows, replicates
+ * the byte immediately left of the block (pRef[-1] of that row) across the
+ * 8 bytes of the matching row in pPred (rows are 8 bytes apart in pPred).
+ *   pPred    - destination block, 8 rows of 8 bytes
+ *   pRef     - top-left sample of the block in the reference picture
+ *   kiStride - byte stride of the reference picture
+ * Each row reads the 8 bytes at pRef - 8 with gsldxc1.
+ * NOTE(review): presumably that address must be 8-byte aligned - confirm.
+ */
+void WelsIChromaPredH_mmi(uint8_t *pPred, uint8_t *pRef, int32_t kiStride) {
+ unsigned char mmi_01bytes[16]__attribute__((aligned(16))) =
+ {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ /* Constants shared by all rows: $f2 = 0x01 bytes, $f4 = 0x38, $f6 = 0. */
+ "gsldxc1 $f2, 0x0(%[mmi_01bytes], $0) \n\t"
+ "dli $8, 0x38 \n\t"
+ "dmtc1 $8, $f4 \n\t"
+ "xor $f6, $f6, $f6 \n\t"
+ /* Row 0: shift the top byte of the 8-byte load (pRef[-1]) down to bit 0. */
+ "gsldxc1 $f0, -0x8(%[pRef], $0) \n\t"
+ "dsrl $f0, $f0, $f4 \n\t"
+
+ /* x 0x0101 duplicates the byte within a halfword; pshufh spreads it. */
+ "pmullh $f0, $f0, $f2 \n\t"
+ "pshufh $f0, $f0, $f6 \n\t"
+ "gssdxc1 $f0, 0x0(%[pPred], $0) \n\t"
+
+ /* Rows 1..7: the macro advances pRef by kiStride before each load. */
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x8)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x10)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x18)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x20)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x28)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x30)
+ MMI_PRED_H_8X8_ONE_LINE($f0, $f2, $f4, $f6, %[pRef], %[pPred], 0x38)
+ : [pPred]"+&r"((unsigned char *)pPred), [pRef]"+&r"((unsigned char *)pRef)
+ : [kiStride]"r"((int)kiStride), [mmi_01bytes]"r"((unsigned char *)mmi_01bytes)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6"
+ );
+}
--- /dev/null
+++ b/codec/common/mips/satd_sad_mmi.c
@@ -1,0 +1,2154 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2018, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file satd_sad_mmi.c
+ *
+ * \brief Loongson optimization
+ *
+ * \date 23/07/2018 Created
+ *
+ *************************************************************************************
+ */
+#include <stdint.h>
+#include "asmdefs_mmi.h"
+
+/*
+ * Horizontal reduction of the halfword lanes held in the f0/f2 register pair,
+ * built from saturating unsigned halfword adds (paddush) combined with
+ * shifts/unpacks by 0x20 and 0x10 bits. The result is left in f0/f2.
+ * Scratch: r0 (GPR used to load the shift counts), f4, f6,
+ * f8 (set to 0x10) and f10 (set to 0x20).
+ * NOTE(review): which lane holds the final total is not evident from this
+ * macro alone - confirm against the call sites.
+ */
+#define MMI_SumWHorizon1(f0, f2, f4, f6, f8, f10, r0) \
+ "dli "#r0", 0x10 \n\t" \
+ "dmtc1 "#r0", "#f8" \n\t" \
+ "dli "#r0", 0x20 \n\t" \
+ "dmtc1 "#r0", "#f10" \n\t" \
+ "mov.d "#f4", "#f2" \n\t" \
+ "xor "#f6", "#f6", "#f6" \n\t" \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t" \
+ "dsrl "#f6", "#f2", "#f10" \n\t" \
+ "punpcklwd "#f4", "#f2", "#f2" \n\t" \
+ "punpckhwd "#f4", "#f0", "#f4" \n\t" \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t" \
+ "dsrl "#f4", "#f0", "#f8" \n\t" \
+ "pinsrh_3 "#f4", "#f4", "#f2" \n\t" \
+ "dsrl "#f6", "#f2", "#f8" \n\t" \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t"
+
+/*
+ * SAD of four 8-byte rows. Loads rows 0..3 from pSample1 and pSample2 with
+ * gsldlc1/gsldrc1 pairs, takes per-byte absolute differences (pasubub), sums
+ * the bytes of each row (biadd) and accumulates rows 0 and 1 into $f24 and
+ * rows 2 and 3 into $f26.
+ * On exit pSample1 and pSample2 have each advanced by four strides.
+ * Scratch: $8, $9, $f0..$f14 (even registers).
+ */
+#define MMI_GetSad8x4 \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f4, 0x7($8) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f4, 0x0($8) \n\t" \
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f6, 0x7($8) \n\t" \
+ "gsldlc1 $f8, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f6, 0x0($8) \n\t" \
+ "gsldrc1 $f8, 0x0(%[pSample2]) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
+ "gsldlc1 $f12, 0x7($9) \n\t" \
+ "gsldlc1 $f10, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f12, 0x0($9) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ "gsldrc1 $f10, 0x0(%[pSample2]) \n\t" \
+ "gsldlc1 $f14, 0x7($9) \n\t" \
+ "gsldrc1 $f14, 0x0($9) \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "pasubub $f4, $f4, $f12 \n\t" \
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
+ "pasubub $f6, $f6, $f14 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f24, $f24, $f0 \n\t" \
+ "paddh $f26, $f26, $f2 \n\t" \
+ "paddh $f24, $f24, $f4 \n\t" \
+ "paddh $f26, $f26, $f6 \n\t"
+
+/*
+ * Final-group variant of the 8x4 SAD step: loads four 8-byte rows from
+ * pSample1 and pSample2 with gsldlc1/gsldrc1 pairs, takes per-byte absolute
+ * differences (pasubub), sums the bytes of each row (biadd) and accumulates
+ * rows 0 and 1 into $f24 and rows 2 and 3 into $f26.
+ * Unlike MMI_GetSad8x4 it omits the two trailing pointer advances, so on
+ * exit pSample1 and pSample2 point at the third row of the group.
+ * Scratch: $8, $9, $f0..$f14 (even registers).
+ */
+#define MMI_GetSad8x4_End \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f4, 0x7($8) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f4, 0x0($8) \n\t" \
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t" \
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t" \
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t" \
+ "gsldlc1 $f6, 0x7($8) \n\t" \
+ "gsldlc1 $f8, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t" \
+ "gsldrc1 $f6, 0x0($8) \n\t" \
+ "gsldrc1 $f8, 0x0(%[pSample2]) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t" \
+ "gsldlc1 $f12, 0x7($9) \n\t" \
+ "gsldlc1 $f10, 0x7(%[pSample2]) \n\t" \
+ "gsldrc1 $f12, 0x0($9) \n\t" \
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t" \
+ "gsldrc1 $f10, 0x0(%[pSample2]) \n\t" \
+ "gsldlc1 $f14, 0x7($9) \n\t" \
+ "gsldrc1 $f14, 0x0($9) \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "pasubub $f4, $f4, $f12 \n\t" \
+ "pasubub $f6, $f6, $f14 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f24, $f24, $f0 \n\t" \
+ "paddh $f26, $f26, $f2 \n\t" \
+ "paddh $f24, $f24, $f4 \n\t" \
+ "paddh $f26, $f26, $f6 \n\t"
+
+/*
+ * Replaces r0 with (r0 & 0x1f) - 0x1f, i.e. the low five address bits minus
+ * 31; the result is zero only when those bits are all set.
+ * NOTE(review): the `width` and `cacheline` parameters are accepted but not
+ * used - the mask and offset are hard-coded to 0x1f. Confirm this matches
+ * what callers intend before passing other values.
+ */
+#define CACHE_SPLIT_CHECK(r0, width, cacheline) \
+ "and "#r0", "#r0", 0x1f \n\t" \
+ PTR_ADDIU ""#r0", "#r0", -0x1f \n\t"
+
+/*
+ * SAD of two 16-byte rows, accumulated into $f0 (bytes 0..7) and
+ * $f2 (bytes 8..15). Both pointers are advanced by one stride BEFORE each
+ * row is read, so the rows processed are the two that follow the entry
+ * position and both pointers end two strides further on.
+ * pSample2 is read with gsldlc1/gsldrc1 pairs, pSample1 with gslqc1.
+ * NOTE(review): gslqc1 presumably requires pSample1 rows to be 16-byte
+ * aligned - confirm.
+ * Scratch: $f4, $f6, $f8, $f10.
+ */
+#define MMI_GetSad2x16 \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f0, $f0, $f4 \n\t" \
+ "paddh $f2, $f2, $f6 \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f0, $f0, $f4 \n\t" \
+ "paddh $f2, $f2, $f6 \n\t"
+
+/*
+ * SAD of four 16-byte rows, accumulated into $f28 (bytes 0..7) and
+ * $f30 (bytes 8..15). Each row is read at the current pointers, which are
+ * then advanced by one stride; on exit both pointers have moved four strides.
+ * pSample2 is read with gsldlc1/gsldrc1 pairs, pSample1 with gslqc1.
+ * NOTE(review): gslqc1 presumably requires pSample1 rows to be 16-byte
+ * aligned - confirm.
+ * Scratch: $f0, $f2, $f4, $f6, $f8, $f10.
+ */
+#define MMI_GetSad4x16 \
+ "gsldlc1 $f0, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f2, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+/*
+ * SAD of four 16-byte rows, accumulated into $f28 (bytes 0..7) and
+ * $f30 (bytes 8..15), reading BOTH pSample1 and pSample2 with gslqc1
+ * quad-word loads. Each row is read at the current pointers, which are then
+ * advanced by one stride; on exit both pointers have moved four strides.
+ * NOTE(review): as the name suggests, both sources are presumably required
+ * to be 16-byte aligned for gslqc1 - confirm.
+ * Scratch: $f0, $f2, $f4, $f6, $f8, $f10.
+ */
+#define MMI_GetSad4x16_Aligned \
+ "gslqc1 $f2, $f0, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+/*
+ * Final-group SAD of four 16-byte rows, accumulated into $f28 (bytes 0..7)
+ * and $f30 (bytes 8..15). pSample2 is read with gsldlc1/gsldrc1 pairs,
+ * pSample1 with gslqc1. The pointers are advanced after each of the first
+ * three rows only, so on exit they point at the fourth (last) row.
+ * NOTE(review): gslqc1 presumably requires pSample1 rows to be 16-byte
+ * aligned - confirm.
+ * Scratch: $f0, $f2, $f4, $f6, $f8, $f10.
+ */
+#define MMI_GetSad4x16_End \
+ "gsldlc1 $f0, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f2, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f0, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f2, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t" \
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t" \
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t" \
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+/*
+ * Final-group SAD of four 16-byte rows, accumulated into $f28 (bytes 0..7)
+ * and $f30 (bytes 8..15), reading BOTH pSample1 and pSample2 with gslqc1
+ * quad-word loads. The pointers are advanced after each of the first three
+ * rows only, so on exit they point at the fourth (last) row.
+ * NOTE(review): both sources are presumably required to be 16-byte aligned
+ * for gslqc1 - confirm.
+ * Scratch: $f0, $f2, $f4, $f6, $f8, $f10.
+ */
+#define MMI_GetSad4x16_Aligned_End \
+ "gslqc1 $f2, $f0, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f0, $f0, $f8 \n\t" \
+ "pasubub $f2, $f2, $f10 \n\t" \
+ "biadd $f0, $f0 \n\t" \
+ "biadd $f2, $f2 \n\t" \
+ "paddh $f28, $f28, $f0 \n\t" \
+ "paddh $f30, $f30, $f2 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t" \
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t" \
+ "gslqc1 $f6, $f4, 0x0(%[pSample2]) \n\t" \
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t" \
+ "pasubub $f4, $f4, $f8 \n\t" \
+ "pasubub $f6, $f6, $f10 \n\t" \
+ "biadd $f4, $f4 \n\t" \
+ "biadd $f6, $f6 \n\t" \
+ "paddh $f28, $f28, $f4 \n\t" \
+ "paddh $f30, $f30, $f6 \n\t"
+
+/*
+ * Accumulates four 16-byte SADs that share the register pair f12/f14:
+ *   $f20/$f22 += SAD(f0/f2,   f12/f14)
+ *   $f16/$f18 += SAD(f12/f14, f8/f10)
+ *   $f24/$f26 += SAD(16 bytes at r0 - 1, f4/f6)
+ *   $f28/$f30 += SAD(16 bytes at r0 + 1, f4/f6)
+ * The two memory operands are read with gsldlc1/gsldrc1 pairs.
+ * f0, f2, f12 and f14 are overwritten; f4, f6, f8, f10 and r0 are only read.
+ * The accumulator registers $f16..$f30 are fixed and not macro parameters.
+ */
+#define MMI_Get4LW16Sad(f0, f2, f4, f6, f8, f10, f12, f14, r0) \
+ "pasubub "#f0", "#f0", "#f12" \n\t" \
+ "pasubub "#f2", "#f2", "#f14" \n\t" \
+ "pasubub "#f12", "#f12", "#f8" \n\t" \
+ "pasubub "#f14", "#f14", "#f10" \n\t" \
+ "biadd "#f0", "#f0" \n\t" \
+ "biadd "#f2", "#f2" \n\t" \
+ "biadd "#f12", "#f12" \n\t" \
+ "biadd "#f14", "#f14" \n\t" \
+ "paddh $f20, $f20, "#f0" \n\t" \
+ "paddh $f22, $f22, "#f2" \n\t" \
+ "paddh $f16, $f16, "#f12" \n\t" \
+ "paddh $f18, $f18, "#f14" \n\t" \
+ "gsldlc1 "#f12", 0x6("#r0") \n\t" \
+ "gsldlc1 "#f14", 0xE("#r0") \n\t" \
+ "gsldrc1 "#f12", -0x1("#r0") \n\t" \
+ "gsldrc1 "#f14", 0x7("#r0") \n\t" \
+ "pasubub "#f12", "#f12", "#f4" \n\t" \
+ "pasubub "#f14", "#f14", "#f6" \n\t" \
+ "biadd "#f12", "#f12" \n\t" \
+ "biadd "#f14", "#f14" \n\t" \
+ "paddh $f24, $f24, "#f12" \n\t" \
+ "paddh $f26, $f26, "#f14" \n\t" \
+ "gsldlc1 "#f12", 0x8("#r0") \n\t" \
+ "gsldlc1 "#f14", 0x10("#r0") \n\t" \
+ "gsldrc1 "#f12", 0x1("#r0") \n\t" \
+ "gsldrc1 "#f14", 0x9("#r0") \n\t" \
+ "pasubub "#f12", "#f12", "#f4" \n\t" \
+ "pasubub "#f14", "#f14", "#f6" \n\t" \
+ "biadd "#f12", "#f12" \n\t" \
+ "biadd "#f14", "#f14" \n\t" \
+ "paddh $f28, $f28, "#f12" \n\t" \
+ "paddh $f30, $f30, "#f14" \n\t"
+
+/*
+ * MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18)
+ *
+ * Applies MMI_SumSub (defined elsewhere in this file) four times to the row
+ * pairs (f0/f2, f4/f6), (f8/f10, f12/f14), (f4/f6, f12/f14) and
+ * (f0/f2, f8/f10), always passing f16/f18 as the last two arguments.
+ * NOTE(review): presumably one Hadamard butterfly pass over two 4x4 blocks
+ * with f16/f18 as scratch - confirm against MMI_SumSub.
+ */
+#define MMI_HDMTwo4x4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18) \
+ MMI_SumSub(f0, f2, f4, f6, f16, f18) \
+ MMI_SumSub(f8, f10, f12, f14, f16, f18) \
+ MMI_SumSub(f4, f6, f12, f14, f16, f18) \
+ MMI_SumSub(f0, f2, f8, f10, f16, f18)
+
+/*
+ * MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26)
+ *
+ * Runs WELS_AbsH (defined elsewhere) in place on the four register pairs
+ * f0/f2, f4/f6, f12/f14 and f16/f18, then adds all four pairs into the
+ * accumulator pair f24/f26 using saturating unsigned halfword adds (paddush).
+ *
+ * f8/f10 and f20/f22 are handed to WELS_AbsH as its trailing arguments
+ * (presumably scratch registers - confirm against WELS_AbsH).
+ * f0, f2, f12 and f14 are overwritten with intermediate sums.
+ */
+#define MMI_SumAbs4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, f20, f22, f24, f26) \
+ WELS_AbsH(f0, f2, f0, f2, f8, f10) \
+ WELS_AbsH(f4, f6, f4, f6, f8, f10) \
+ WELS_AbsH(f12, f14, f12, f14, f20, f22) \
+ WELS_AbsH(f16, f18, f16, f18, f20, f22) \
+ "paddush "#f0", "#f0", "#f4" \n\t" \
+ "paddush "#f2", "#f2", "#f6" \n\t" \
+ "paddush "#f12", "#f12", "#f16" \n\t" \
+ "paddush "#f14", "#f14", "#f18" \n\t" \
+ "paddush "#f24", "#f24", "#f0" \n\t" \
+ "paddush "#f26", "#f26", "#f2" \n\t" \
+ "paddush "#f24", "#f24", "#f12" \n\t" \
+ "paddush "#f26", "#f26", "#f14" \n\t"
+
+/*
+ * MMI_SumWHorizon(f0, f2, f4, f6, f8, f10)
+ *
+ * Horizontal reduction of the halfword lanes held in f0/f2 to one word sum.
+ *   1. f0 += f2 (halfword lanes).
+ *   2. The halfwords of f0 are interleaved with f8 into two registers of
+ *      words (punpckhhw / punpcklhw) and those are added together; the
+ *      callers in this file pass a zeroed register as f8.
+ *   3. f0 is shuffled by the pshufh control held in f10 and added to itself;
+ *      the callers in this file load 0x4e into f10.
+ * The result is read from the low word of f0. f2 is destroyed.
+ * Parameters f4 and f6 are not used by the macro body.
+ */
+#define MMI_SumWHorizon(f0, f2, f4, f6, f8, f10) \
+ "paddh "#f0", "#f0", "#f2" \n\t" \
+ "punpckhhw "#f2", "#f0", "#f8" \n\t" \
+ "punpcklhw "#f0", "#f0", "#f8" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t" \
+ "pshufh "#f2", "#f0", "#f10" \n\t" \
+ "paddw "#f0", "#f0", "#f2" \n\t"
+
+/*
+ * MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1)
+ *
+ * Loads 8 bytes from r0 and 8 bytes from r1 (unaligned gsldlc1/gsldrc1
+ * pairs), interleaves each with f8 to widen the bytes to halfwords and
+ * leaves the halfword differences (r0 data - r1 data) in f0 (low four
+ * pixels) and f2 (high four pixels). f4/f6 are scratch. The callers in this
+ * file pass a zeroed register as f8.
+ *
+ * Side effect independent of r0/r1: sets
+ *   $11 = %[pSample1] + %[iStride1]
+ *   $12 = %[pSample2] + %[iStride2]
+ * i.e. the addresses of the next row, for use by the following
+ * MMI_LoadDiff8P_Offset_Stride1 / MMI_LoadDiff8P_Offset8 / MMI_LoadDiff8P.
+ * The enclosing asm must therefore provide those operands and list $11 and
+ * $12 as clobbered.
+ */
+#define MMI_LoadDiff8P_Offset_Stride0(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ PTR_ADDU "$11, %[pSample1], %[iStride1] \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ PTR_ADDU "$12, %[pSample2], %[iStride2] \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
+/*
+ * MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1)
+ *
+ * Same load / widen / subtract as MMI_LoadDiff8P_Offset_Stride0: f0/f2
+ * receive the halfword differences of the 8 bytes at r0 minus the 8 bytes
+ * at r1, f4/f6 are scratch and f8 is the register the bytes are
+ * interleaved with.
+ *
+ * Side effect independent of r0/r1: advances the block pointers past the
+ * row addressed by $11/$12:
+ *   %[pSample1] = $11 + %[iStride1]
+ *   %[pSample2] = $12 + %[iStride2]
+ * $11 and $12 must have been set beforehand (the Stride0 variant does so).
+ */
+#define MMI_LoadDiff8P_Offset_Stride1(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ PTR_ADDU "%[pSample1], $11, %[iStride1] \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ PTR_ADDU "%[pSample2], $12, %[iStride2] \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
+/*
+ * MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1)
+ *
+ * Same load / widen / subtract as the Stride0/Stride1 variants: f0/f2
+ * receive the halfword differences of the 8 bytes at r0 minus the 8 bytes
+ * at r1, f4/f6 are scratch and f8 is the register the bytes are
+ * interleaved with.
+ *
+ * Side effect independent of r0/r1: repositions the block pointers 8 bytes
+ * to the right of the addresses saved in $9/$10:
+ *   %[pSample1] = $9  + 8
+ *   %[pSample2] = $10 + 8
+ * The callers in this file copy the original pSample1/pSample2 into $9/$10
+ * before the first macro, so this moves on to the right-hand 8 columns.
+ */
+#define MMI_LoadDiff8P_Offset8(f0, f2, f4, f6, f8, r0, r1) \
+ "gsldlc1 "#f0", 0x7("#r0") \n\t" \
+ "gsldlc1 "#f4", 0x7("#r1") \n\t" \
+ PTR_ADDU "%[pSample1], $9, 0x8 \n\t" \
+ "gsldrc1 "#f0", 0x0("#r0") \n\t" \
+ "gsldrc1 "#f4", 0x0("#r1") \n\t" \
+ PTR_ADDU "%[pSample2], $10, 0x8 \n\t" \
+ "punpckhbh "#f2", "#f0", "#f8" \n\t" \
+ "punpcklbh "#f0", "#f0", "#f8" \n\t" \
+ "punpckhbh "#f6", "#f4", "#f8" \n\t" \
+ "punpcklbh "#f4", "#f4", "#f8" \n\t" \
+ "psubh "#f0", "#f0", "#f4" \n\t" \
+ "psubh "#f2", "#f2", "#f6" \n\t"
+
+/*
+ * MMI_GetSatd8x8
+ *
+ * Processes an 8-pixel-wide, 8-row block as two groups of four rows. Each
+ * group: load the four rows of pSample1 - pSample2 differences into
+ * $f0..$f14, run MMI_HDMTwo4x4, MMI_TransTwo4x4H (defined elsewhere) and
+ * MMI_HDMTwo4x4 again, then add the absolute values into $f24/$f26 with
+ * MMI_SumAbs4.
+ *
+ * Every row load advances the pointers, so on exit %[pSample1] and
+ * %[pSample2] point eight rows below where they started.
+ *
+ * Requires $f28 to be zero and $f24/$f26 to hold the running sum on entry.
+ * Uses $11, $12 and $f0-$f22 as scratch.
+ */
+#define MMI_GetSatd8x8 \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+/*
+ * MMI_GetSatd8x8_Offset8
+ *
+ * Identical to MMI_GetSatd8x8 except for the very last row load, which uses
+ * MMI_LoadDiff8P_Offset8: instead of stepping down one more row it resets
+ * %[pSample1] / %[pSample2] to ($9 + 8) / ($10 + 8).
+ *
+ * Requires $9 and $10 to hold the block start addresses saved by the
+ * caller, $f28 to be zero and $f24/$f26 to hold the running sum.
+ * Uses $11, $12 and $f0-$f22 as scratch.
+ */
+#define MMI_GetSatd8x8_Offset8 \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset8($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+/*
+ * MMI_GetSatd8x8_End
+ *
+ * Identical to MMI_GetSatd8x8 except for the very last row load, which uses
+ * MMI_LoadDiff8P (defined elsewhere in this file) on $11/$12.
+ * NOTE(review): presumably MMI_LoadDiff8P does not move %[pSample1] /
+ * %[pSample2], which is why this variant is used for the final 8x8 block -
+ * confirm against its definition.
+ *
+ * Requires $f28 to be zero and $f24/$f26 to hold the running sum.
+ * Uses $11, $12 and $f0-$f22 as scratch.
+ */
+#define MMI_GetSatd8x8_End \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26) \
+ MMI_LoadDiff8P_Offset_Stride0($f0, $f2, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P_Offset_Stride1($f4, $f6, $f20, $f22, $f28, $11, $12) \
+ MMI_LoadDiff8P_Offset_Stride0($f8, $f10, $f16, $f18, $f28, %[pSample1], %[pSample2]) \
+ MMI_LoadDiff8P($f12, $f14, $f20, $f22, $f28, $11, $12) \
+ MMI_HDMTwo4x4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18) \
+ MMI_TransTwo4x4H($f12, $f14, $f4, $f6, $f0, $f2, $f8, $f10, $f16, $f18) \
+ MMI_HDMTwo4x4($f12, $f14, $f4, $f6, $f8, $f10, $f16, $f18, $f20, $f22) \
+ MMI_SumAbs4($f16, $f18, $f4, $f6, $f0, $f2, $f8, $f10, $f12, $f14, $f20, $f22, $f24, $f26)
+
+/*
+ * Sum of absolute differences between two 16x16 pixel blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes; read with
+ *                       gslqc1 quad-word loads in the code visible here.
+ * pSample2 / iStride2 - second block and its row pitch in bytes; may be
+ *                       unaligned.
+ * Returns the SAD over all 256 pixels.
+ *
+ * The low four bits of pSample2 select the code path: zero uses the
+ * MMI_GetSad4x16_Aligned* macros, anything else uses the MMI_GetSad4x16*
+ * macros. Either path accumulates into $f28/$f30.
+ *
+ * The branch targets are numeric local labels (1:, 2:) rather than named
+ * ones: a named label inside inline asm becomes a file-level assembler
+ * symbol, so it is defined twice as soon as the compiler emits this asm more
+ * than once (inlining, cloning) and can clash with same-named labels in any
+ * other asm statement of the translation unit.
+ */
+int32_t WelsSampleSad16x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "and $8, %[pSample2], 0xF \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ // pSample2 not 16-byte aligned -> take the unaligned-load path at 1:
+ "bnez $8, 1f \n\t"
+ MMI_GetSad4x16_Aligned
+ MMI_GetSad4x16_Aligned
+ MMI_GetSad4x16_Aligned
+ MMI_GetSad4x16_Aligned_End
+ "b 2f \n\t"
+
+ "1: \n\t"
+ MMI_GetSad4x16
+ MMI_GetSad4x16
+ MMI_GetSad4x16
+ MMI_GetSad4x16_End
+ // common exit: fold the two partial sums and move the result to a GPR
+ "2: \n\t"
+ "mov.d $f0, $f30 \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+/*
+ * Sum of absolute differences between two 16-pixel-wide, 8-row blocks
+ * (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes; read with
+ *                       gslqc1 quad-word loads.
+ * pSample2 / iStride2 - second block and its row pitch in bytes; read with
+ *                       unaligned gsldlc1/gsldrc1 pairs.
+ * Returns the value left in $f0 after the two halves $f0/$f2 are added.
+ *
+ * The first two rows are handled inline; the remaining rows are delegated
+ * to three MMI_GetSad2x16 expansions (defined elsewhere in this file).
+ * NOTE(review): the inline part leaves both pointers on the second row, so
+ * MMI_GetSad2x16 presumably advances them before loading and accumulates
+ * into $f0/$f2 - confirm against its definition.
+ */
+int32_t WelsSampleSad16x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ // row 0: per-half byte sums of |pSample2 - pSample1| start the totals $f0/$f2
+ "gsldlc1 $f0, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f2, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f2, 0x8(%[pSample2]) \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "pasubub $f0, $f0, $f8 \n\t"
+ "pasubub $f2, $f2, $f10 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ // row 1: same computation in $f4/$f6, then added to the totals
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f6, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f6, 0x8(%[pSample2]) \n\t"
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "pasubub $f4, $f4, $f8 \n\t"
+ "pasubub $f6, $f6, $f10 \n\t"
+ "biadd $f4, $f4 \n\t"
+ "biadd $f6, $f6 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ MMI_GetSad2x16
+ MMI_GetSad2x16
+ MMI_GetSad2x16
+
+ // fold the two halves and move the result to a GPR
+ "paddh $f0, $f0, $f2 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+/*
+ * Sum of absolute differences between two 8-pixel-wide, 16-row blocks
+ * (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the sum of the two accumulators $f24 and $f26.
+ *
+ * The accumulators are zeroed here; all row work is done by three
+ * MMI_GetSad8x4 expansions and one MMI_GetSad8x4_End (both defined elsewhere
+ * in this file, presumably four rows each - confirm against the macros).
+ */
+int32_t WelsSampleSad8x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ MMI_GetSad8x4
+ MMI_GetSad8x4
+ MMI_GetSad8x4
+ MMI_GetSad8x4_End
+ "paddh $f0, $f26, $f24 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+/*
+ * Sum of absolute differences between two 4x4 pixel blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the SAD over the 16 pixels.
+ *
+ * Two rows are processed at a time: each row is fetched with an 8-byte
+ * unaligned load and punpcklwd keeps only the low word (4 pixels) of each,
+ * packing two rows into one register. Note that every row load therefore
+ * touches 8 bytes of memory although only the first 4 are used.
+ *
+ * Unlike the other functions in this group this one does not use
+ * BACKUP_REG / RECOVER_REG; it only touches $f0-$f8, $8 and $9.
+ */
+int32_t WelsSampleSad4x4_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ // rows 0 and 1 of pSample1 packed into $f0
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f2, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f2, 0x0($8) \n\t"
+ "punpcklwd $f0, $f0, $f2 \n\t"
+
+ // rows 0 and 1 of pSample2 packed into $f6, then SAD of the pair in $f0
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "punpcklwd $f6, $f6, $f8 \n\t"
+ "pasubub $f0, $f0, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+
+ // step both pointers down to row 2
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+
+ // rows 2 and 3 of pSample1 packed into $f2
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ "punpcklwd $f2, $f2, $f4 \n\t"
+
+ // rows 2 and 3 of pSample2 packed into $f6, SAD of the pair added to $f0
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ "punpcklwd $f6, $f6, $f8 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f0, $f0, $f2 \n\t"
+
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8"
+ );
+ return iSadSum;
+}
+
+/*
+ * Sum of absolute differences between two 8x8 pixel blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the SAD over the 64 pixels.
+ *
+ * Two code paths, selected by CACHE_SPLIT_CHECK($8, 8, 32) (defined
+ * elsewhere; presumably leaves a positive value in $8 when an 8-byte row of
+ * pSample2 straddles a 32-byte cache line - confirm against the macro):
+ *
+ *   $8 <= 0 : label 1 - plain path built from MMI_GetSad8x4 and
+ *             MMI_GetSad8x4_End, accumulating in $f24/$f26.
+ *   $8 >  0 : fall-through path. pSample2 is rounded down to an 8-byte
+ *             boundary and every pSample2 row is rebuilt from the two
+ *             neighbouring aligned 8-byte words:
+ *                 row = (word_lo >> 8*k) | (word_hi << 8*(8-k))
+ *             where k = original pSample2 & 7. Rows are handled in pairs
+ *             and accumulated in $f28/$f30.
+ */
+int32_t WelsSampleSad8x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSadSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ CACHE_SPLIT_CHECK($8, 8, 32)
+ "blez $8, 1f \n\t"
+ "nop \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+
+ // $9 = k = pSample2 & 7; pSample2 is rounded down; $8 = 8 - k
+ "move $9, %[pSample2] \n\t"
+ "and $9, $9, 0x7 \n\t"
+ PTR_SUBU "%[pSample2], %[pSample2], $9 \n\t"
+ "dli $8, 0x8 \n\t"
+ PTR_SUBU "$8, $8, $9 \n\t"
+
+ // byte counts -> bit counts; $f20 = right-shift amount, $f24 = left-shift amount
+ "dsll $9, $9, 0x3 \n\t"
+ "dsll $8, $8, 0x3 \n\t"
+ "dmtc1 $9, $f20 \n\t"
+ "dmtc1 $8, $f24 \n\t"
+ // from here on $9 tracks pSample2 + 8 (the upper aligned word of each row)
+ "dli $9, 0x8 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ PTR_ADDU "$9, $9, %[pSample2] \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ // rows 0/1: $f0,$f2 = pSample1 rows; $f4,$f6 = low words, $f8,$f10 = high words of pSample2 rows
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+
+ // rows 2/3
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+
+ // accumulate rows 2/3; the first pSample1 load of rows 4/5 is interleaved here
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+
+ // accumulate rows 4/5; the first pSample1 load of rows 6/7 is interleaved here
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f2, 0x7(%[pSample1]) \n\t"
+
+ "gsldlc1 $f4, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f8, 0x7($9) \n\t"
+ "gsldrc1 $f2, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f8, 0x0($9) \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ PTR_ADDU "$9, $9, %[iStride2] \n\t"
+ "gsldlc1 $f6, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0x7($9) \n\t"
+ "gsldrc1 $f6, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x0($9) \n\t"
+ "dsrl $f4, $f4, $f20 \n\t"
+ "dsrl $f6, $f6, $f20 \n\t"
+ "dsll $f8, $f8, $f24 \n\t"
+ "dsll $f10, $f10, $f24 \n\t"
+ "or $f4, $f4, $f8 \n\t"
+ "or $f6, $f6, $f10 \n\t"
+
+ // accumulate rows 6/7 (pointers are not advanced any further)
+ "pasubub $f0, $f0, $f4 \n\t"
+ "pasubub $f2, $f2, $f6 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f28, $f28, $f0 \n\t"
+ "paddh $f30, $f30, $f2 \n\t"
+
+ // fold the two accumulators, fetch the result and skip the plain path
+ "mov.d $f0, $f30 \n\t"
+ "paddh $f0, $f0, $f28 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ "j 2f \n\t"
+ "nop \n\t"
+
+ // plain path: no special handling of pSample2, accumulators are $f24/$f26
+ "1: \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ MMI_GetSad8x4
+ MMI_GetSad8x4_End
+ "paddh $f0, $f26, $f24 \n\t"
+ "dmfc1 %[iSadSum], $f0 \n\t"
+ "2: \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSadSum]"=r"((int)iSadSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSadSum;
+}
+
+/*
+ * SATD (sum of absolute transformed differences) of two 4x4 pixel blocks
+ * (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the low 16 bits of the reduced absolute sum, shifted right by one.
+ *
+ * Outline of the asm:
+ *   1. Load four rows of each block (8-byte unaligned loads, low word used)
+ *      and pack rows 0+2 and rows 1+3 into one register each.
+ *   2. Widen bytes to halfwords by interleaving with zero and subtract
+ *      pSample2 from pSample1.
+ *   3. Alternate add/subtract butterflies with the MMI_XSawp_* / punpck*
+ *      rearrangements (MMI_XSawp_DQ and MMI_XSawp_WD are defined elsewhere).
+ *   4. Take absolute values with WELS_AbsH, accumulate with saturating adds
+ *      into $f24/$f26 and reduce with MMI_SumWHorizon1 (defined elsewhere).
+ */
+int32_t WelsSampleSatd4x4_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ // pSample1: rows 0..3 -> $f0, $f4, $f8, $f12; then $f0 = rows 0|2, $f4 = rows 1|3
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f0, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f4, 0x7($8) \n\t"
+ "gsldrc1 $f0, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f4, 0x0($8) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ "gsldlc1 $f8, 0x7(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7($8) \n\t"
+ "gsldrc1 $f8, 0x0(%[pSample1]) \n\t"
+ "gsldrc1 $f12, 0x0($8) \n\t"
+ "punpcklwd $f0, $f0, $f8 \n\t"
+ "punpcklwd $f4, $f4, $f12 \n\t"
+
+ // pSample2: rows 0..3 -> $f16, $f20, $f24, $f28; then $f16 = rows 0|2, $f20 = rows 1|3
+ PTR_ADDU "$8, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f16, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f20, 0x7($8) \n\t"
+ "gsldrc1 $f16, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f20, 0x0($8) \n\t"
+ PTR_ADDU "%[pSample2], $8, %[iStride2] \n\t"
+ PTR_ADDU "$8, %[pSample2], %[iStride2] \n\t"
+ "gsldlc1 $f24, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f28, 0x7($8) \n\t"
+ "gsldrc1 $f24, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f28, 0x0($8) \n\t"
+ "punpcklwd $f16, $f16, $f24 \n\t"
+ "punpcklwd $f20, $f20, $f28 \n\t"
+
+ // $f24/$f26 become zero: used as the unpack partner now and as the accumulator later
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "punpckhbh $f2, $f0, $f24 \n\t"
+ "punpcklbh $f0, $f0, $f24 \n\t"
+ "punpckhbh $f6, $f4, $f24 \n\t"
+ "punpcklbh $f4, $f4, $f24 \n\t"
+ "punpckhbh $f18, $f16, $f24 \n\t"
+ "punpcklbh $f16, $f16, $f24 \n\t"
+ "punpckhbh $f22, $f20, $f24 \n\t"
+ "punpcklbh $f20, $f20, $f24 \n\t"
+
+ // halfword differences pSample1 - pSample2
+ "psubh $f0, $f0, $f16 \n\t"
+ "psubh $f2, $f2, $f18 \n\t"
+ "psubh $f4, $f4, $f20 \n\t"
+ "psubh $f6, $f6, $f22 \n\t"
+
+ // butterfly: $f0/$f2 = sum, $f8/$f10 = difference
+ "mov.d $f8, $f0 \n\t"
+ "mov.d $f10, $f2 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+ "psubh $f8, $f8, $f4 \n\t"
+ "psubh $f10, $f10, $f6 \n\t"
+ MMI_XSawp_DQ($f0, $f2, $f8, $f10, $f12, $f14)
+
+ // butterfly: $f0/$f2 = sum, $f16/$f18 = difference
+ "mov.d $f16, $f0 \n\t"
+ "mov.d $f18, $f2 \n\t"
+ "paddh $f0, $f0, $f12 \n\t"
+ "paddh $f2, $f2, $f14 \n\t"
+ "psubh $f16, $f16, $f12 \n\t"
+ "psubh $f18, $f18, $f14 \n\t"
+
+ // halfword interleave of the sum and difference registers
+ "mov.d $f8, $f2 \n\t"
+ "punpckhhw $f2, $f0, $f16 \n\t"
+ "punpcklhw $f0, $f0, $f16 \n\t"
+ "punpcklhw $f16, $f18, $f8 \n\t"
+ "punpckhhw $f18, $f18, $f8 \n\t"
+
+ MMI_XSawp_WD($f0, $f2, $f16, $f18, $f12, $f14)
+ MMI_XSawp_DQ($f0, $f2, $f12, $f14, $f20, $f22)
+
+ // butterfly: $f0/$f2 = sum, $f28/$f30 = difference
+ "mov.d $f28, $f0 \n\t"
+ "mov.d $f30, $f2 \n\t"
+ "paddh $f0, $f0, $f20 \n\t"
+ "paddh $f2, $f2, $f22 \n\t"
+ "psubh $f28, $f28, $f20 \n\t"
+ "psubh $f30, $f30, $f22 \n\t"
+
+ MMI_XSawp_DQ($f0, $f2, $f28, $f30, $f4, $f6)
+
+ // last butterfly: $f8/$f10 = difference, $f0/$f2 = sum
+ "psubh $f8, $f0, $f4 \n\t"
+ "psubh $f10, $f2, $f6 \n\t"
+ "paddh $f0, $f0, $f4 \n\t"
+ "paddh $f2, $f2, $f6 \n\t"
+
+ // absolute values accumulated (saturating) into $f24/$f26, then reduced
+ WELS_AbsH($f0, $f2, $f0, $f2, $f12, $f14)
+ "paddush $f24, $f24, $f0 \n\t"
+ "paddush $f26, $f26, $f2 \n\t"
+ WELS_AbsH($f8, $f10, $f8, $f10, $f16, $f18)
+ "paddush $f24, $f24, $f8 \n\t"
+ "paddush $f26, $f26, $f10 \n\t"
+ MMI_SumWHorizon1($f24, $f26, $f16, $f18, $f28, $f30, $8)
+
+ // result = (low 16 bits of $f24) >> 1
+ "dmfc1 $8, $f24 \n\t"
+ "dli $9, 0xffff \n\t"
+ "and $8, $8, $9 \n\t"
+ "dsrl %[iSatdSum], $8, 0x1 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+/*
+ * SATD of two 8x8 pixel blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the low word of $f24 after the horizontal reduction.
+ *
+ * Register roles set up here for the macros:
+ *   $f24/$f26 - accumulators (zeroed)
+ *   $f28      - zero, used as the unpack partner
+ *   $f30      - first the shift count 1 for psrlh (each halfword lane of
+ *               the accumulators is halved), then the pshufh control 0x4e
+ *               for MMI_SumWHorizon.
+ * Assumes the MMI_GetSatd8x8_End expansion leaves $f28 and $f30 untouched
+ * (true for the parts visible in this file; MMI_LoadDiff8P, MMI_SumSub,
+ * MMI_TransTwo4x4H and WELS_AbsH are defined elsewhere - confirm).
+ */
+int32_t WelsSampleSatd8x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_GetSatd8x8_End
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dli $8, 0x4e \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+/*
+ * SATD of two 8-pixel-wide, 16-row blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the low word of $f24 after the horizontal reduction.
+ *
+ * MMI_GetSatd8x8 handles the upper 8 rows and leaves both pointers on row 8;
+ * MMI_GetSatd8x8_End handles the lower 8 rows. Both add into $f24/$f26.
+ * Afterwards each halfword lane of the accumulators is shifted right by one
+ * ($f30 = 1) and the lanes are summed by MMI_SumWHorizon with the pshufh
+ * control 0x4e in $f30 and the zero register $f28.
+ */
+int32_t WelsSampleSatd8x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_GetSatd8x8
+ MMI_GetSatd8x8_End
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dli $8, 0x4e \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$11", "$12", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10",
+ "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+/*
+ * SATD of two 16-pixel-wide, 8-row blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the low word of $f24 after the horizontal reduction.
+ *
+ * The block start addresses are saved in $9/$10 so that
+ * MMI_GetSatd8x8_Offset8, after processing the left 8 columns, can
+ * reposition both pointers to the start address + 8; MMI_GetSatd8x8_End then
+ * processes the right 8 columns. Both add into $f24/$f26.
+ * Afterwards each halfword lane of the accumulators is shifted right by one
+ * ($f30 = 1) and the lanes are summed by MMI_SumWHorizon with the pshufh
+ * control 0x4e in $f30 and the zero register $f28.
+ */
+int32_t WelsSampleSatd16x8_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ // remember the top-left corners for the +8 column step
+ "move $9, %[pSample1] \n\t"
+ "move $10, %[pSample2] \n\t"
+ MMI_GetSatd8x8_Offset8
+
+ MMI_GetSatd8x8_End
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dli $8, 0x4e \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f30)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
+ "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+/*
+ * SATD of two 16x16 pixel blocks (Loongson MMI).
+ *
+ * pSample1 / iStride1 - first block and its row pitch in bytes.
+ * pSample2 / iStride2 - second block and its row pitch in bytes.
+ * Returns the low word of $f24 after the horizontal reduction.
+ *
+ * The block is walked as four 8x8 quadrants, all adding into $f24/$f26:
+ *   MMI_GetSatd8x8         - left columns, rows 0-7 (pointers end on row 8)
+ *   MMI_GetSatd8x8_Offset8 - left columns, rows 8-15, then pointers are
+ *                            reset to the saved start ($9/$10) + 8
+ *   MMI_GetSatd8x8         - right columns, rows 0-7
+ *   MMI_GetSatd8x8_End     - right columns, rows 8-15
+ * The accumulators use saturating 16-bit adds (see MMI_SumAbs4).
+ *
+ * Unlike the sibling Satd functions the pshufh control 0x4e is placed in
+ * $f0, so $f30 still holds the shift count 1 for both psrlh instructions.
+ */
+int32_t WelsSampleSatd16x16_mmi (uint8_t* pSample1, int32_t iStride1,
+ uint8_t* pSample2, int32_t iStride2) {
+ int32_t iSatdSum = 0;
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "dli $8, 0x1 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "dmtc1 $8, $f30 \n\t"
+ // remember the top-left corners for the +8 column step
+ "move $9, %[pSample1] \n\t"
+ "move $10, %[pSample2] \n\t"
+
+ MMI_GetSatd8x8
+ MMI_GetSatd8x8_Offset8
+
+ MMI_GetSatd8x8
+ MMI_GetSatd8x8_End
+
+ "dli $8, 0x4e \n\t"
+ "psrlh $f24, $f24, $f30 \n\t"
+ "dmtc1 $8, $f0 \n\t"
+ "psrlh $f26, $f26, $f30 \n\t"
+ MMI_SumWHorizon($f24, $f26, $f16, $f18, $f28, $f0)
+ "mfc1 %[iSatdSum], $f24 \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1), [iSatdSum]"=r"((int)iSatdSum),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2)
+ : "memory", "$8", "$9", "$10", "$11", "$12", "$f0", "$f2", "$f4", "$f6",
+ "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24",
+ "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+ return iSatdSum;
+}
+
+void WelsSampleSadFour16x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+ int32_t iStride2, int32_t* pSad) {
+ BACKUP_REG;
+ __asm__ volatile (
+ ".set arch=loongson3a \n\t"
+ "xor $f16, $f16, $f16 \n\t"
+ "xor $f18, $f18, $f18 \n\t"
+ "xor $f20, $f20, $f20 \n\t"
+ "xor $f22, $f22, $f22 \n\t"
+ PTR_SUBU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "xor $f24, $f24, $f24 \n\t"
+ "xor $f26, $f26, $f26 \n\t"
+ "xor $f28, $f28, $f28 \n\t"
+ "xor $f30, $f30, $f30 \n\t"
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "%[pSample1], %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], %[pSample2], %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ "pasubub $f12, $f12, $f4 \n\t"
+ "pasubub $f14, $f14, $f6 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f16, $f16, $f12 \n\t"
+ "paddh $f18, $f18, $f14 \n\t"
+
+ "gsldlc1 $f8, 0x6(%[pSample2]) \n\t"
+ "gsldlc1 $f10, 0xE(%[pSample2]) \n\t"
+ "gsldrc1 $f8, -0x1(%[pSample2]) \n\t"
+ "gsldrc1 $f10, 0x7(%[pSample2]) \n\t"
+ "pasubub $f8, $f8, $f0 \n\t"
+ "pasubub $f10, $f10, $f2 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+
+ "gsldlc1 $f12, 0x8(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x1(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0x10(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x9(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+ "gslqc1 $f2, $f0, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gslqc1 $f6, $f4, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+ "gslqc1 $f10, $f8, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+ "gslqc1 $f2, $f0, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+ "gslqc1 $f6, $f4, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$8, %[pSample1], %[iStride1] \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+
+ "gslqc1 $f10, $f8, 0x0($8) \n\t"
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ PTR_ADDU "%[pSample1], $8, %[iStride1] \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)
+ "gslqc1 $f2, $f0, 0x0(%[pSample1]) \n\t"
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ PTR_ADDU "$9, %[pSample2], %[iStride2] \n\t"
+ MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+ "gsldlc1 $f12, 0x7($9) \n\t"
+ "gsldlc1 $f14, 0xF($9) \n\t"
+ "gsldrc1 $f12, 0x0($9) \n\t"
+ "gsldrc1 $f14, 0x8($9) \n\t"
+ "pasubub $f8, $f8, $f12 \n\t"
+ "pasubub $f10, $f10, $f14 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f20, $f20, $f8 \n\t"
+ "paddh $f22, $f22, $f10 \n\t"
+
+ "gsldlc1 $f8, 0x6($9) \n\t"
+ "gsldlc1 $f10, 0xE($9) \n\t"
+ "gsldrc1 $f8, -0x1($9) \n\t"
+ "gsldrc1 $f10, 0x7($9) \n\t"
+ "pasubub $f8, $f8, $f0 \n\t"
+ "pasubub $f10, $f10, $f2 \n\t"
+ "biadd $f8, $f8 \n\t"
+ "biadd $f10, $f10 \n\t"
+ "paddh $f24, $f24, $f8 \n\t"
+ "paddh $f26, $f26, $f10 \n\t"
+
+ "gsldlc1 $f12, 0x8($9) \n\t"
+ "gsldlc1 $f14, 0x10($9) \n\t"
+ "gsldrc1 $f12, 0x1($9) \n\t"
+ "gsldrc1 $f14, 0x9($9) \n\t"
+ PTR_ADDU "%[pSample2], $9, %[iStride2] \n\t"
+ "pasubub $f12, $f12, $f0 \n\t"
+ "pasubub $f14, $f14, $f2 \n\t"
+ "biadd $f12, $f12 \n\t"
+ "biadd $f14, $f14 \n\t"
+ "paddh $f28, $f28, $f12 \n\t"
+ "paddh $f30, $f30, $f14 \n\t"
+
+ "gsldlc1 $f12, 0x7(%[pSample2]) \n\t"
+ "gsldlc1 $f14, 0xF(%[pSample2]) \n\t"
+ "gsldrc1 $f12, 0x0(%[pSample2]) \n\t"
+ "gsldrc1 $f14, 0x8(%[pSample2]) \n\t"
+ "pasubub $f0, $f0, $f12 \n\t"
+ "pasubub $f2, $f2, $f14 \n\t"
+ "biadd $f0, $f0 \n\t"
+ "biadd $f2, $f2 \n\t"
+ "paddh $f20, $f20, $f0 \n\t"
+ "paddh $f22, $f22, $f2 \n\t"
+
+ "paddh $f16, $f16, $f18 \n\t"
+ "paddh $f20, $f20, $f22 \n\t"
+ "paddh $f24, $f24, $f26 \n\t"
+ "paddh $f28, $f28, $f30 \n\t"
+ "punpcklwd $f16, $f16, $f20 \n\t"
+ "punpcklwd $f24, $f24, $f28 \n\t"
+ "gssqc1 $f24, $f16, 0x0(%[pSad]) \n\t"
+ : [pSample1]"+&r"((unsigned char *)pSample1),
+ [pSample2]"+&r"((unsigned char *)pSample2)
+ : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+ [pSad]"r"((int *)pSad)
+ : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
+ "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+ );
+ RECOVER_REG;
+}
+
+void WelsSampleSadFour16x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+                                int32_t iStride2, int32_t* pSad) {  /* 16x8 SADs of pSample1 against pSample2 displaced by -iStride2, +iStride2, -1 byte and +1 byte; results stored at pSad */
+  BACKUP_REG;  /* macro defined outside this chunk; presumably saves FP registers used below -- confirm */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"  /* $f16/$f18: accumulators for the pSample2 - iStride2 candidate */
+    "xor        $f18, $f18, $f18                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"  /* $f20/$f22: accumulators for the pSample2 + iStride2 candidate */
+    "xor        $f22, $f22, $f22                          \n\t"
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"  /* 16 bytes of pSample1 row 0; presumably needs 16-byte alignment -- confirm */
+    PTR_SUBU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"  /* pSample2 -> row -1 */
+    "xor        $f24, $f24, $f24                          \n\t"  /* $f24/$f26: accumulators for the pSample2 - 1 candidate */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"  /* $f28/$f30: accumulators for the pSample2 + 1 candidate */
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"  /* left/right load pairs fetch pSample2 row -1 (bytes 0-7 and 8-15) */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "%[pSample1], %[pSample1], %[iStride1]     \n\t"  /* pSample1 -> row 1 */
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], %[pSample2], %[iStride2]     \n\t"  /* pSample2 -> back to row 0 */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"  /* pSample1 row 0 vs pSample2 row -1 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"  /* pSample1 row 1 */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"  /* pSample2 row 0 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f4                           \n\t"
+    "pasubub    $f14, $f14, $f6                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"  /* pSample1 row 1 vs pSample2 row 0 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f8, 0x6(%[pSample2])                     \n\t"  /* 16 bytes starting at pSample2 - 1 (row 0) */
+    "gsldlc1    $f10, 0xE(%[pSample2])                    \n\t"
+    "gsldrc1    $f8, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f10, 0x7(%[pSample2])                    \n\t"
+    "pasubub    $f8, $f8, $f0                             \n\t"
+    "pasubub    $f10, $f10, $f2                           \n\t"
+    "biadd      $f8, $f8                                  \n\t"
+    "biadd      $f10, $f10                                \n\t"
+    "paddh      $f24, $f24, $f8                           \n\t"  /* pSample1 row 0 vs pSample2 - 1 */
+    "paddh      $f26, $f26, $f10                          \n\t"
+
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"  /* 16 bytes starting at pSample2 + 1 (row 0) */
+    "gsldlc1    $f14, 0x10(%[pSample2])                   \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x9(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = pSample1 row 2 */
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = pSample2 row 1 */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"  /* pSample1 row 0 vs pSample2 + 1 */
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gslqc1     $f10, $f8, 0x0($8)                        \n\t"  /* pSample1 row 2 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 row 1 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $9)  /* macro defined outside this chunk; presumably updates all four accumulators for pSample2 row 1 from the three pSample1 rows held in registers -- confirm */
+    "gslqc1     $f2, $f0, 0x0(%[pSample1])                \n\t"  /* pSample1 row 3; register triples rotate each round */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"  /* pSample2 row 2 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, %[pSample2])
+    "gslqc1     $f6, $f4, 0x0($8)                         \n\t"  /* pSample1 row 4 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 row 3 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, $9)
+    "gslqc1     $f10, $f8, 0x0(%[pSample1])               \n\t"  /* pSample1 row 5 */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"  /* pSample2 row 4 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, %[pSample2])
+    "gslqc1     $f2, $f0, 0x0($8)                         \n\t"  /* pSample1 row 6 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 row 5 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    MMI_Get4LW16Sad($f4, $f6, $f8, $f10, $f0, $f2, $f12, $f14, $9)
+    "gslqc1     $f6, $f4, 0x0(%[pSample1])                \n\t"  /* pSample1 row 7 (last row of the block) */
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"  /* pSample2 row 6 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = pSample2 row 7; no further pSample1 row is loaded */
+    MMI_Get4LW16Sad($f8, $f10, $f0, $f2, $f4, $f6, $f12, $f14, %[pSample2])
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* tail handled by hand: pSample2 row 7 */
+    "gsldlc1    $f14, 0xF($9)                             \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x8($9)                             \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"  /* pSample1 row 6 vs pSample2 row 7 */
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x6($9)                              \n\t"  /* row 7 starting at pSample2 - 1 */
+    "gsldlc1    $f2, 0xE($9)                              \n\t"
+    "gsldrc1    $f0, -0x1($9)                             \n\t"
+    "gsldrc1    $f2, 0x7($9)                              \n\t"
+    "pasubub    $f0, $f0, $f4                             \n\t"
+    "pasubub    $f2, $f2, $f6                             \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f24, $f24, $f0                           \n\t"
+    "paddh      $f26, $f26, $f2                           \n\t"
+
+    "gsldlc1    $f12, 0x8($9)                             \n\t"  /* row 7 starting at pSample2 + 1 */
+    "gsldlc1    $f14, 0x10($9)                            \n\t"
+    "gsldrc1    $f12, 0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x9($9)                             \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 -> row 8, one row past the block */
+    "pasubub    $f12, $f12, $f4                           \n\t"
+    "pasubub    $f14, $f14, $f6                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7(%[pSample2])                    \n\t"  /* pSample2 row 8 */
+    "gsldlc1    $f14, 0xF(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0(%[pSample2])                    \n\t"
+    "gsldrc1    $f14, 0x8(%[pSample2])                    \n\t"
+    "pasubub    $f4, $f4, $f12                            \n\t"
+    "pasubub    $f6, $f6, $f14                            \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f20, $f20, $f4                           \n\t"  /* pSample1 row 7 vs pSample2 row 8 */
+    "paddh      $f22, $f22, $f6                           \n\t"
+
+    "paddh      $f16, $f16, $f18                          \n\t"  /* merge the two half-row accumulators of each candidate */
+    "paddh      $f20, $f20, $f22                          \n\t"
+    "paddh      $f24, $f24, $f26                          \n\t"
+    "paddh      $f28, $f28, $f30                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"  /* pair the -iStride2 and +iStride2 totals in one register */
+    "punpcklwd  $f24, $f24, $f28                          \n\t"  /* pair the -1 and +1 totals in one register */
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"  /* 16-byte store of all four totals; presumably order is above, below, left, right and pSad must be 16-byte aligned -- confirm */
+    : [pSample1]"+&r"((unsigned char *)pSample1),  /* both pointers are modified by the asm above */
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",  /* $8/$9 are the scratch row pointers */
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
+  );
+  RECOVER_REG;  /* macro defined outside this chunk; presumably the counterpart of BACKUP_REG -- confirm */
+}
+
+void WelsSampleSadFour8x16_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+                                int32_t iStride2, int32_t* pSad) {  /* 8x16 SADs of pSample1 against pSample2 displaced by -iStride2, +iStride2, -1 byte and +1 byte; results stored at pSad */
+  BACKUP_REG;  /* macro defined outside this chunk; presumably saves FP registers used below -- confirm */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"  /* $f16/$f18: accumulators for the pSample2 - iStride2 candidate */
+    "xor        $f18, $f18, $f18                          \n\t"
+    "xor        $f20, $f20, $f20                          \n\t"  /* $f20/$f22: accumulators for the pSample2 + iStride2 candidate */
+    "xor        $f22, $f22, $f22                          \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* $f24/$f26: accumulators for the pSample2 - 1 candidate */
+    "xor        $f26, $f26, $f26                          \n\t"
+    "xor        $f28, $f28, $f28                          \n\t"  /* $f28/$f30: accumulators for the pSample2 + 1 candidate */
+    "xor        $f30, $f30, $f30                          \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = pSample1 row 1 */
+    PTR_SUBU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = pSample2 row -1 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 recomputed as row 0 */
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* $f0/$f2 = pSample1 rows 0/1, $f12/$f14 = pSample2 rows -1/0 (8 bytes each) */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"  /* pSample1 rows 0/1 vs pSample2 rows -1/0 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = pSample2 row 1 */
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"  /* $f4 = row 0 from pSample2 - 1, $f12 = row 0 from pSample2 + 1 */
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"  /* pSample1 -> row 2 */
+    "gsldlc1    $f6, 0x6($9)                              \n\t"  /* same two shifted loads for row 1 */
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"  /* rows 0/1 vs pSample2 - 1 */
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 -> row 2 */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"  /* rows 0/1 vs pSample2 + 1 */
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* $f12/$f14 = pSample2 rows 1/2 */
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = pSample1 row 3 */
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"  /* pSample1 rows 0/1 vs pSample2 rows 1/2 */
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 2/3; $f12/$f14 still hold pSample2 rows 1/2 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"  /* pSample1 rows 2/3 vs pSample2 rows 1/2 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"  /* -1 / +1 byte loads for rows 2/3 */
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 rows 3/4 for the +iStride2 candidate of rows 2/3 */
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 4/5: same sequence as for rows 2/3 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 6/7 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 8/9 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 10/11 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 12/13 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 14/15 (last pair of the block) */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"  /* pSample1 -> row 16; computed but not loaded from afterwards */
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 -> row 16, one row past the block */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 rows 15/16 */
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"  /* pSample1 rows 14/15 vs pSample2 rows 15/16 */
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "paddh      $f16, $f16, $f18                          \n\t"  /* merge the even-row and odd-row accumulators of each candidate */
+    "paddh      $f20, $f20, $f22                          \n\t"
+    "paddh      $f24, $f24, $f26                          \n\t"
+    "paddh      $f28, $f28, $f30                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"  /* pair the -iStride2 and +iStride2 totals in one register */
+    "punpcklwd  $f24, $f24, $f28                          \n\t"  /* pair the -1 and +1 totals in one register */
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"  /* 16-byte store of all four totals; presumably order is above, below, left, right and pSad must be 16-byte aligned -- confirm */
+    : [pSample1]"+&r"((unsigned char *)pSample1),  /* both pointers are modified by the asm above */
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",  /* $8/$9 are the scratch row pointers; $f8/$f10 are listed although no instruction in this asm body names them */
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30"
+  );
+  RECOVER_REG;  /* macro defined outside this chunk; presumably the counterpart of BACKUP_REG -- confirm */
+}
+
+void WelsSampleSadFour8x8_mmi (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2,
+                               int32_t iStride2, int32_t* pSad) {  /* 8x8 SADs of pSample1 against pSample2 displaced by -iStride2, +iStride2, -1 byte and +1 byte; results stored at pSad */
+  BACKUP_REG;  /* macro defined outside this chunk; presumably saves FP registers used below -- confirm */
+  __asm__ volatile (
+    ".set       arch=loongson3a                           \n\t"
+    "xor        $f16, $f16, $f16                          \n\t"  /* $f16/$f18: accumulators for the pSample2 - iStride2 candidate */
+    "xor        $f18, $f18, $f18                          \n\t"
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* $f0 = pSample1 row 0 (completed by the gsldrc1 below) */
+    "xor        $f20, $f20, $f20                          \n\t"  /* $f20/$f22: accumulators for the pSample2 + iStride2 candidate */
+    "xor        $f22, $f22, $f22                          \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "xor        $f24, $f24, $f24                          \n\t"  /* $f24/$f26: accumulators for the pSample2 - 1 candidate */
+    "xor        $f26, $f26, $f26                          \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = pSample1 row 1 */
+    PTR_SUBU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = pSample2 row -1 */
+    "xor        $f28, $f28, $f28                          \n\t"  /* $f28/$f30: accumulators for the pSample2 + 1 candidate */
+    "xor        $f30, $f30, $f30                          \n\t"
+    "gsldlc1    $f2, 0x7($8)                              \n\t"  /* $f2 = pSample1 row 1, $f12 = pSample2 row -1 */
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 recomputed as row 0 */
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"  /* $f14 = pSample2 row 0 */
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"  /* pSample1 rows 0/1 vs pSample2 rows -1/0 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"  /* $f4 = row 0 from pSample2 - 1, $f12 = row 0 from pSample2 + 1 */
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"  /* $9 = pSample2 row 1 */
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"  /* pSample1 -> row 2 */
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"  /* same two shifted loads for row 1 */
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"  /* rows 0/1 vs pSample2 - 1 */
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"  /* rows 0/1 vs pSample2 + 1 */
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 -> row 2 */
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* $f12/$f14 = pSample2 rows 1/2 */
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"  /* $8 = pSample1 row 3 */
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"  /* pSample1 rows 0/1 vs pSample2 rows 1/2 */
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 2/3; $f12/$f14 still hold pSample2 rows 1/2 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"  /* pSample1 rows 2/3 vs pSample2 rows 1/2 */
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"  /* -1 / +1 byte loads for rows 2/3 */
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 rows 3/4 for the +iStride2 candidate of rows 2/3 */
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 4/5: same sequence as for rows 2/3 */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    PTR_ADDU   "$8, %[pSample1], %[iStride1]              \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "gsldlc1    $f0, 0x7(%[pSample1])                     \n\t"  /* pSample1 rows 6/7 (last pair of the block) */
+    "gsldlc1    $f2, 0x7($8)                              \n\t"
+    "gsldrc1    $f0, 0x0(%[pSample1])                     \n\t"
+    "gsldrc1    $f2, 0x0($8)                              \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f16, $f16, $f12                          \n\t"
+    "paddh      $f18, $f18, $f14                          \n\t"
+
+    "gsldlc1    $f4, 0x6(%[pSample2])                     \n\t"
+    "gsldlc1    $f12, 0x8(%[pSample2])                    \n\t"
+    PTR_ADDU   "$9, %[pSample2], %[iStride2]              \n\t"
+    PTR_ADDU   "%[pSample1], $8, %[iStride1]              \n\t"  /* pSample1 -> row 8; computed but not loaded from afterwards */
+    "gsldrc1    $f4, -0x1(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x1(%[pSample2])                    \n\t"
+
+    "gsldlc1    $f6, 0x6($9)                              \n\t"
+    "gsldlc1    $f14, 0x8($9)                             \n\t"
+    "gsldrc1    $f6, -0x1($9)                             \n\t"
+    "gsldrc1    $f14, 0x1($9)                             \n\t"
+
+    "pasubub    $f4, $f4, $f0                             \n\t"
+    "pasubub    $f6, $f6, $f2                             \n\t"
+    "biadd      $f4, $f4                                  \n\t"
+    "biadd      $f6, $f6                                  \n\t"
+    "paddh      $f24, $f24, $f4                           \n\t"
+    "paddh      $f26, $f26, $f6                           \n\t"
+    "pasubub    $f12, $f12, $f0                           \n\t"
+    "pasubub    $f14, $f14, $f2                           \n\t"
+    PTR_ADDU   "%[pSample2], $9, %[iStride2]              \n\t"  /* pSample2 -> row 8, one row past the block */
+    "biadd      $f12, $f12                                \n\t"
+    "biadd      $f14, $f14                                \n\t"
+    "paddh      $f28, $f28, $f12                          \n\t"
+    "paddh      $f30, $f30, $f14                          \n\t"
+
+    "gsldlc1    $f12, 0x7($9)                             \n\t"  /* pSample2 rows 7/8 */
+    "gsldlc1    $f14, 0x7(%[pSample2])                    \n\t"
+    "gsldrc1    $f12, 0x0($9)                             \n\t"
+    "gsldrc1    $f14, 0x0(%[pSample2])                    \n\t"
+    "pasubub    $f0, $f0, $f12                            \n\t"
+    "pasubub    $f2, $f2, $f14                            \n\t"
+    "biadd      $f0, $f0                                  \n\t"
+    "biadd      $f2, $f2                                  \n\t"
+    "paddh      $f20, $f20, $f0                           \n\t"  /* pSample1 rows 6/7 vs pSample2 rows 7/8 */
+    "paddh      $f22, $f22, $f2                           \n\t"
+
+    "paddh      $f16, $f16, $f18                          \n\t"  /* merge the even-row and odd-row accumulators of each candidate */
+    "paddh      $f20, $f20, $f22                          \n\t"
+    "paddh      $f24, $f24, $f26                          \n\t"
+    "paddh      $f28, $f28, $f30                          \n\t"
+    "punpcklwd  $f16, $f16, $f20                          \n\t"  /* pair the -iStride2 and +iStride2 totals in one register */
+    "punpcklwd  $f24, $f24, $f28                          \n\t"  /* pair the -1 and +1 totals in one register */
+    "gssqc1     $f24, $f16, 0x0(%[pSad])                  \n\t"  /* 16-byte store of all four totals; presumably order is above, below, left, right and pSad must be 16-byte aligned -- confirm */
+    : [pSample1]"+&r"((unsigned char *)pSample1),  /* both pointers are modified by the asm above */
+      [pSample2]"+&r"((unsigned char *)pSample2)
+    : [iStride1]"r"((int)iStride1), [iStride2]"r"((int)iStride2),
+      [pSad]"r"((int *)pSad)
+    : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",  /* $8/$9 are the scratch row pointers; $f8/$f10 are listed although no instruction in this asm body names them */
+      "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28","$f30"
+  );
+  RECOVER_REG;  /* macro defined outside this chunk; presumably the counterpart of BACKUP_REG -- confirm */
+}
--- a/codec/common/src/expand_pic.cpp
+++ b/codec/common/src/expand_pic.cpp
@@ -140,6 +140,13 @@
pExpandPicFunc->pfExpandChromaPicture[1] = ExpandPictureChroma_AArch64_neon;
}
#endif//HAVE_NEON_AARCH64
+#if defined(HAVE_MMI)
+ if (kuiCPUFlag & WELS_CPU_MMI) {
+ pExpandPicFunc->pfExpandLumaPicture = ExpandPictureLuma_mmi;
+ pExpandPicFunc->pfExpandChromaPicture[0] = ExpandPictureChromaUnalign_mmi;
+ pExpandPicFunc->pfExpandChromaPicture[1] = ExpandPictureChromaAlign_mmi;
+ }
+#endif//HAVE_MMI
}
--- a/codec/encoder/core/inc/get_intra_predictor.h
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -153,6 +153,16 @@
void WelsIChromaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
void WelsIChromaPredDcTop_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
#endif//HAVE_NEON_AARCH64
+
+#if defined(HAVE_MMI)
+void WelsI16x16LumaPredDc_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+
+void WelsIChromaPredH_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredV_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredDc_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+void WelsIChromaPredPlane_mmi (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -124,6 +124,14 @@
int32_t WelsIntra4x4Combined3Satd_AArch64_neon (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t*, int32_t, int32_t,
int32_t);
#endif
+
+#if defined (HAVE_MMI)
+int32_t WelsSampleSatd8x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x8_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd8x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd16x16_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+int32_t WelsSampleSatd4x4_mmi (uint8_t*, int32_t, uint8_t*, int32_t);
+#endif//HAVE_MMI
#if defined(__cplusplus)
}
#endif//__cplusplus
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -778,6 +778,11 @@
*pfSetNZCZero = WelsNonZeroCount_sse2;
}
#endif
+#if defined(HAVE_MMI)
+ if (iCpu & WELS_CPU_MMI) {
+ *pfSetNZCZero = WelsNonZeroCount_mmi;
+ }
+#endif
}
void DeblockingInit (DeblockingFunc* pFunc, int32_t iCpu) {
@@ -842,6 +847,19 @@
#endif
}
#endif
+
+#if defined(HAVE_MMI)
+ if (iCpu & WELS_CPU_MMI) {
+ pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_mmi;
+ pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_mmi;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_mmi;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_mmi;
+ pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_mmi;
+ pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_mmi;
+ pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_mmi;
+ pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_mmi;
+ }
+#endif//HAVE_MMI
}
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -586,5 +586,16 @@
pFuncList->pfDctFourT4 = WelsDctFourT4_AArch64_neon;
}
#endif
+
+#if defined(HAVE_MMI)
+ if (uiCpuFlag & WELS_CPU_MMI) {
+ pFuncList->pfCopy8x8Aligned = WelsCopy8x8_mmi;
+ pFuncList->pfCopy8x16Aligned = WelsCopy8x16_mmi;
+
+ pFuncList->pfCopy16x16Aligned = WelsCopy16x16_mmi;
+ pFuncList->pfCopy16x16NotAligned = WelsCopy16x16NotAligned_mmi;
+ pFuncList->pfCopy16x8NotAligned = WelsCopy16x8NotAligned_mmi;
+ }
+#endif//HAVE_MMI
}
}
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -720,5 +720,19 @@
pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_sse2;
}
#endif
+
+#if defined(HAVE_MMI) // compile-time gate: MMI intra-prediction hookup
+ if (kuiCpuFlag & WELS_CPU_MMI) { // run-time gate on the WELS_CPU_MMI bit of kuiCpuFlag
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_mmi; // 16x16 luma table: V, H, DC and P entries are all replaced
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_mmi;
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_mmi;
+ pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_mmi;
+
+ pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmi; // chroma table: H, DC, V and P entries are all replaced
+ pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChromaPredDc_mmi;
+ pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChromaPredV_mmi;
+ pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChromaPredPlane_mmi; // overwrites the C_PRED_P entry that may have been set earlier in this function
+ }
+#endif//HAVE_MMI
}
}
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -469,6 +469,27 @@
pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_AArch64_neon;
}
#endif
+
+#if defined (HAVE_MMI) // compile-time gate: MMI SAD / SATD / 4-SAD hookup
+ if (uiCpuFlag & WELS_CPU_MMI) { // run-time gate on the WELS_CPU_MMI bit of uiCpuFlag
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_mmi; // SAD table: five block sizes (16x16, 16x8, 8x16, 8x8, 4x4)
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_mmi;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_mmi;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_mmi;
+ pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_mmi;
+
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_mmi; // SATD table: the same five block sizes
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_mmi;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_mmi;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_mmi;
+ pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_mmi;
+
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_mmi; // 4-SAD table: four block sizes; no BLOCK_4x4 entry is assigned in this block
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_mmi;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_mmi;
+ pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_mmi;
+ }
+#endif//HAVE_MMI
}
} // namespace WelsEnc
--- a/test/encoder/EncUT_EncoderMbAux.cpp
+++ b/test/encoder/EncUT_EncoderMbAux.cpp
@@ -271,6 +271,11 @@
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_sse2);
GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_sse2);
#endif
+#ifdef HAVE_MMI // instantiate the copy unit tests for the MMI routines only in HAVE_MMI builds
+GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8NotAligned_mmi); // NOTE(review): first two arguments are presumably block width and height -- confirm against the macro definition
+GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16NotAligned_mmi);
+GENERATE_UT_FOR_COPY (16, 16, WelsCopy16x16_mmi); // NOTE(review): WelsCopy8x8_mmi and WelsCopy8x16_mmi have no instantiation in this group -- confirm whether they are covered elsewhere
+#endif
namespace {
--- a/test/encoder/EncUT_Sample.cpp
+++ b/test/encoder/EncUT_Sample.cpp
@@ -672,6 +672,20 @@
GENERATE_Sad16x16_UT (WelsSampleSatd16x16_AArch64_neon, WelsSampleSatd16x16_c, WELS_CPU_NEON)
#endif
+#ifdef HAVE_MMI // instantiate the SAD and SATD unit tests for the MMI routines only in HAVE_MMI builds
+GENERATE_Sad4x4_UT (WelsSampleSad4x4_mmi, WelsSampleSad4x4_c, WELS_CPU_MMI) // each line pairs an _mmi routine with its _c counterpart and the WELS_CPU_MMI flag
+GENERATE_Sad8x8_UT (WelsSampleSad8x8_mmi, WelsSampleSad8x8_c, WELS_CPU_MMI) // NOTE(review): presumably the macro compares the two results -- confirm against the macro definition
+GENERATE_Sad8x16_UT (WelsSampleSad8x16_mmi, WelsSampleSad8x16_c, WELS_CPU_MMI)
+GENERATE_Sad16x8_UT (WelsSampleSad16x8_mmi, WelsSampleSad16x8_c, WELS_CPU_MMI)
+GENERATE_Sad16x16_UT (WelsSampleSad16x16_mmi, WelsSampleSad16x16_c, WELS_CPU_MMI)
+
+GENERATE_Sad4x4_UT (WelsSampleSatd4x4_mmi, WelsSampleSatd4x4_c, WELS_CPU_MMI) // the SATD routines reuse the same GENERATE_Sad*_UT macros as the SAD routines above
+GENERATE_Sad8x8_UT (WelsSampleSatd8x8_mmi, WelsSampleSatd8x8_c, WELS_CPU_MMI)
+GENERATE_Sad8x16_UT (WelsSampleSatd8x16_mmi, WelsSampleSatd8x16_c, WELS_CPU_MMI)
+GENERATE_Sad16x8_UT (WelsSampleSatd16x8_mmi, WelsSampleSatd16x8_c, WELS_CPU_MMI)
+GENERATE_Sad16x16_UT (WelsSampleSatd16x16_mmi, WelsSampleSatd16x16_c, WELS_CPU_MMI)
+#endif
+
#define GENERATE_SadFour_UT(func, CPUFLAGS, width, height) \
TEST_F (SadSatdAssemblyFuncTest, func) { \
if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
@@ -719,4 +733,11 @@
GENERATE_SadFour_UT (WelsSampleSadFour8x16_AArch64_neon, WELS_CPU_NEON, 8, 16)
GENERATE_SadFour_UT (WelsSampleSadFour16x8_AArch64_neon, WELS_CPU_NEON, 16, 8)
GENERATE_SadFour_UT (WelsSampleSadFour16x16_AArch64_neon, WELS_CPU_NEON, 16, 16)
+#endif
+
+#ifdef HAVE_MMI // instantiate the four-candidate SAD unit tests for the MMI routines only in HAVE_MMI builds
+GENERATE_SadFour_UT (WelsSampleSadFour8x8_mmi, WELS_CPU_MMI, 8, 8) // arguments follow the macro's (func, CPUFLAGS, width, height) parameter list
+GENERATE_SadFour_UT (WelsSampleSadFour8x16_mmi, WELS_CPU_MMI, 8, 16) // the macro checks m_uiCpuFeatureFlag against CPUFLAGS before exercising func
+GENERATE_SadFour_UT (WelsSampleSadFour16x8_mmi, WELS_CPU_MMI, 16, 8)
+GENERATE_SadFour_UT (WelsSampleSadFour16x16_mmi, WELS_CPU_MMI, 16, 16)
#endif