shithub: libvpx

ref: ebf7466cd8b884fd29be42ebe670317f5a7ca04d
parent: cf1c0ebc3a56ecb8f0b9c7ee5a0de8da00d70b93
author: Parag Salasakar <[email protected]>
date: Tue Jun 2 06:29:56 EDT 2015

mips msa vp9 updated convolve horiz, vert, hv, copy, avg module

Updated the sources to use the improved versions of the common MSA macros.
Enabled the corresponding convolve MSA function hooks and tests.
Overall, this just upgrades the code with styling changes.

Change-Id: If5ad6ef8ea7ca47feed6d2fc9f34f0f0e8b6694d
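
For context on what "improved versions of the common MSA macros" means here: the new macros batch operations that the old code issued one vector at a time, and they split the fused SRARI_SATURATE_SIGNED_H helper into a rounded-shift step (SRARI_H2_SH/SRARI_H4_SH) followed by a saturation step (SAT_SH2_SH/SAT_SH4_SH). A rough sketch of the batching pattern, paraphrased in the style of the shared macros header rather than quoted from it:

#include <msa.h>  /* MIPS MSA intrinsics; needs an MSA-enabled mips toolchain */

/* Sketch only: a batched splat in the style of the common MSA macros.
   The real definitions live in the shared macros header. */
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
  out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0);    \
  out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1);    \
}

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)

With macros of this shape, the five-line filter rearrangement in each function collapses to the single SPLATI_H4_SB call seen in the hunks below.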

--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -1815,8 +1815,7 @@
     make_tuple(64, 64, &convolve8_dspr2)));
 #endif
 
-#if 0  // HAVE_MSA
-// TODO(parag): enable when function hooks are added
+#if HAVE_MSA
 const ConvolveFunctions convolve8_msa(
     vp9_convolve_copy_msa, vp9_convolve_avg_msa,
     vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_c,
--- a/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
@@ -14,37 +14,29 @@
 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src0, src1, src2, src3;
-  v16u8 mask0, mask1, mask2, mask3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
   v8i16 filt, out0, out1;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
   mask3 = mask0 + 6;
 
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
-
-  XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
   HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                              filt0, filt1, filt2, filt3, out0, out1);
-
-  out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-  out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-
-  PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
+  SRARI_H2_SH(out0, out1, FILTER_BITS);
+  SAT_SH2_SH(out0, out1, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
@@ -52,47 +44,36 @@
                                  int8_t *filter) {
   v16i8 filt0, filt1, filt2, filt3;
   v16i8 src0, src1, src2, src3;
-  v16u8 mask0, mask1, mask2, mask3;
+  v16u8 mask0, mask1, mask2, mask3, out;
   v8i16 filt, out0, out1, out2, out3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
   mask3 = mask0 + 6;
 
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
   src += (4 * src_stride);
-
-  XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
   HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                              filt0, filt1, filt2, filt3, out0, out1);
-
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
-
-  XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
   HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                              filt0, filt1, filt2, filt3, out2, out3);
-
-  out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-  out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-  out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-  out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
-
-  PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
   dst += (4 * dst_stride);
-  PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
+  out = PCKEV_XORI128_UB(out2, out3);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
 }
 
 static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
@@ -108,40 +89,31 @@
 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src0, src1, src2, src3;
-  v16u8 mask0, mask1, mask2, mask3;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
   v8i16 filt, out0, out1, out2, out3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
   mask3 = mask0 + 6;
 
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
-
-  XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
   HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                              filt0, filt1, filt2, filt3, out0, out1, out2,
                              out3);
-
-  out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-  out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-  out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-  out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
-
-  PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  tmp0 = PCKEV_XORI128_UB(out0, out1);
+  tmp1 = PCKEV_XORI128_UB(out2, out3);
+  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
 }
 
 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@@ -148,21 +120,16 @@
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src0, src1, src2, src3;
-  v16u8 mask0, mask1, mask2, mask3;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
   v8i16 filt, out0, out1, out2, out3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
@@ -169,21 +136,17 @@
   mask3 = mask0 + 6;
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
     src += (4 * src_stride);
-
-    XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
     HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                mask3, filt0, filt1, filt2, filt3, out0, out1,
                                out2, out3);
-
-    out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-    out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-    out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-    out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
-
-    PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    tmp0 = PCKEV_XORI128_UB(out0, out1);
+    tmp1 = PCKEV_XORI128_UB(out2, out3);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
     dst += (4 * dst_stride);
   }
 }
@@ -202,21 +165,16 @@
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
   v8i16 filt, out0, out1, out2, out3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
@@ -223,27 +181,20 @@
   mask3 = mask0 + 6;
 
   for (loop_cnt = (height >> 1); loop_cnt--;) {
-    src0 = LOAD_SB(src);
-    src1 = LOAD_SB(src + 8);
-    src += src_stride;
-    src2 = LOAD_SB(src);
-    src3 = LOAD_SB(src + 8);
-    src += src_stride;
-
-    XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
+    LD_SB2(src, src_stride, src0, src2);
+    LD_SB2(src + 8, src_stride, src1, src3);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    src += (2 * src_stride);
     HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                mask3, filt0, filt1, filt2, filt3, out0, out1,
                                out2, out3);
-
-    out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-    out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-    out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-    out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
-
-    PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
     dst += dst_stride;
-    PCKEV_B_XORI128_STORE_VEC(out3, out2, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst);
     dst += dst_stride;
   }
 }
@@ -252,21 +203,16 @@
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
   v8i16 filt, out0, out1, out2, out3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
@@ -273,47 +219,40 @@
   mask3 = mask0 + 6;
 
   for (loop_cnt = (height >> 1); loop_cnt--;) {
-    src0 = LOAD_SB(src);
-    src2 = LOAD_SB(src + 16);
-    src3 = LOAD_SB(src + 24);
-    src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
     src += src_stride;
-
-    XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
+    XORI_B4_128_SB(src0, src1, src2, src3);
     HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                mask3, filt0, filt1, filt2, filt3, out0, out1,
                                out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
 
-    out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-    out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-    out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-    out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
+    src += src_stride;
 
-    src0 = LOAD_SB(src);
-    src2 = LOAD_SB(src + 16);
-    src3 = LOAD_SB(src + 24);
-    src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
-
-    PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
-    PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
     dst += dst_stride;
 
-    XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
-
+    XORI_B4_128_SB(src0, src1, src2, src3);
     HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                mask3, filt0, filt1, filt2, filt3, out0, out1,
                                out2, out3);
-
-    out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-    out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-    out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-    out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
-
-    PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
-    PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
-
-    src += src_stride;
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
     dst += dst_stride;
   }
 }
@@ -321,22 +260,17 @@
 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
-  uint32_t loop_cnt, cnt;
-  v16i8 src0, src1, src2, src3;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3;
+  int32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, out;
   v8i16 filt, out0, out1, out2, out3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
   src -= 3;
 
   /* rearranging filter */
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
@@ -343,28 +277,38 @@
   mask3 = mask0 + 6;
 
   for (loop_cnt = height; loop_cnt--;) {
-    for (cnt = 0; cnt < 2; ++cnt) {
-      src0 = LOAD_SB(&src[cnt << 5]);
-      src2 = LOAD_SB(&src[16 + (cnt << 5)]);
-      src3 = LOAD_SB(&src[24 + (cnt << 5)]);
-      src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
 
-      XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 16);
 
-      HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                                 mask3, filt0, filt1, filt2, filt3, out0, out1,
-                                 out2, out3);
-
-      out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
-      out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
-      out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
-      out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
-
-      PCKEV_B_XORI128_STORE_VEC(out1, out0, &dst[cnt << 5]);
-      PCKEV_B_XORI128_STORE_VEC(out3, out2, &dst[16 + (cnt << 5)]);
-    }
-
+    src0 = LD_SB(src + 32);
+    src2 = LD_SB(src + 48);
+    src3 = LD_SB(src + 56);
+    src1 = __msa_sldi_b(src2, src0, 8);
     src += src_stride;
+
+    XORI_B4_128_SB(src0, src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filt0, filt1, filt2, filt3, out0, out1,
+                               out2, out3);
+    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+    SAT_SH4_SH(out0, out1, out2, out3, 7);
+    out = PCKEV_XORI128_UB(out0, out1);
+    ST_UB(out, dst + 32);
+    out = PCKEV_XORI128_UB(out2, out3);
+    ST_UB(out, dst + 48);
     dst += dst_stride;
   }
 }
@@ -372,124 +316,55 @@
 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  uint32_t out0, out1, out2, out3;
   v16i8 src0, src1, src2, src3, mask;
-  v16u8 vec0, vec1, filt0;
-  v16i8 res0, res1;
+  v16u8 filt0, vec0, vec1, res0, res1;
   v8u16 vec2, vec3, filt, const255;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[16]);
+  mask = LD_SB(&mc_filt_mask_arr[16]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
-
-  vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
-  vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
-
-  vec2 = __msa_dotp_u_h(vec0, filt0);
-  vec3 = __msa_dotp_u_h(vec1, filt0);
-
-  vec2 = (v8u16)__msa_srari_h((v8i16)vec2, FILTER_BITS);
-  vec3 = (v8u16)__msa_srari_h((v8i16)vec3, FILTER_BITS);
-
-  vec2 = __msa_min_u_h(vec2, const255);
-  vec3 = __msa_min_u_h(vec3, const255);
-
-  res0 = __msa_pckev_b((v16i8)vec2, (v16i8)vec2);
-  res1 = __msa_pckev_b((v16i8)vec3, (v16i8)vec3);
-
-  out0 = __msa_copy_u_w((v4i32)res0, 0);
-  out1 = __msa_copy_u_w((v4i32)res0, 1);
-  out2 = __msa_copy_u_w((v4i32)res1, 0);
-  out3 = __msa_copy_u_w((v4i32)res1, 1);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+  MIN_UH2_UH(vec2, vec3, const255);
+  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  uint32_t out0, out1, out2, out3;
-  v16u8 filt0;
+  v16u8 vec0, vec1, vec2, vec3, filt0;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 vec0, vec1, vec2, vec3;
-  v8u16 vec4, vec5, vec6, vec7;
   v16i8 res0, res1, res2, res3;
-  v8u16 filt, const255;
+  v8u16 vec4, vec5, vec6, vec7, filt, const255;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[16]);
+  mask = LD_SB(&mc_filt_mask_arr[16]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
-  LOAD_8VECS_SB(src, src_stride,
-                src0, src1, src2, src3, src4, src5, src6, src7);
-
-  vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
-  vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
-  vec2 = (v16u8)__msa_vshf_b(mask, src5, src4);
-  vec3 = (v16u8)__msa_vshf_b(mask, src7, src6);
-
-  vec4 = __msa_dotp_u_h(vec0, filt0);
-  vec5 = __msa_dotp_u_h(vec1, filt0);
-  vec6 = __msa_dotp_u_h(vec2, filt0);
-  vec7 = __msa_dotp_u_h(vec3, filt0);
-
-  vec4 = (v8u16)__msa_srari_h((v8i16)vec4, FILTER_BITS);
-  vec5 = (v8u16)__msa_srari_h((v8i16)vec5, FILTER_BITS);
-  vec6 = (v8u16)__msa_srari_h((v8i16)vec6, FILTER_BITS);
-  vec7 = (v8u16)__msa_srari_h((v8i16)vec7, FILTER_BITS);
-
-  vec4 = __msa_min_u_h(vec4, const255);
-  vec5 = __msa_min_u_h(vec5, const255);
-  vec6 = __msa_min_u_h(vec6, const255);
-  vec7 = __msa_min_u_h(vec7, const255);
-
-  res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
-  res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
-  res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
-  res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
-
-  out0 = __msa_copy_u_w((v4i32)res0, 0);
-  out1 = __msa_copy_u_w((v4i32)res0, 1);
-  out2 = __msa_copy_u_w((v4i32)res1, 0);
-  out3 = __msa_copy_u_w((v4i32)res1, 1);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
-  dst += dst_stride;
-
-  out0 = __msa_copy_u_w((v4i32)res2, 0);
-  out1 = __msa_copy_u_w((v4i32)res2, 1);
-  out2 = __msa_copy_u_w((v4i32)res3, 0);
-  out3 = __msa_copy_u_w((v4i32)res3, 1);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+              vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
+  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
@@ -507,38 +382,25 @@
                                  int8_t *filter) {
   v16u8 filt0;
   v16i8 src0, src1, src2, src3, mask;
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 out0, out1, out2, out3;
-  v8u16 const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, const255, filt;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
-
-  vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
-  vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
-  vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
-  vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
-
-  vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
-  vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
-  vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
-  vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
-
-  SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
-
-  out0 = __msa_min_u_h(vec0, const255);
-  out1 = __msa_min_u_h(vec1, const255);
-  out2 = __msa_min_u_h(vec2, const255);
-  out3 = __msa_min_u_h(vec3, const255);
-
-  PCKEV_B_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+  ST8x4_UB(src0, src1, dst, dst_stride);
 }
 
 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@@ -545,111 +407,68 @@
                                      uint8_t *dst, int32_t dst_stride,
                                      int8_t *filter, int32_t height) {
   v16u8 filt0;
-  v16i8 src0, src1, src2, src3, mask;
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 filt, const255;
+  v16i8 src0, src1, src2, src3, mask, out0, out1;
+  v8u16 vec0, vec1, vec2, vec3, filt, const255;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
 
-  vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
-  vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
-  vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
-  vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
 
-  vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
-  vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
-  vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
-  vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
-
-  SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
-
-  vec0 = __msa_min_u_h(vec0, const255);
-  vec1 = __msa_min_u_h(vec1, const255);
-  vec2 = __msa_min_u_h(vec2, const255);
-  vec3 = __msa_min_u_h(vec3, const255);
-
-  LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
 
-  PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
   dst += (4 * dst_stride);
 
-  vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
-  vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
-  vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
-  vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
-
-  vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
-  vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
-  vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
-  vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
-
-  SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
-
-  vec0 = __msa_min_u_h(vec0, const255);
-  vec1 = __msa_min_u_h(vec1, const255);
-  vec2 = __msa_min_u_h(vec2, const255);
-  vec3 = __msa_min_u_h(vec3, const255);
-
-  PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+              vec2, vec3);
+  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
   dst += (4 * dst_stride);
 
   if (16 == height) {
-    LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);
 
-    vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
-    vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
-    vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
-    vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
-
-    vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
-    vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
-    vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
-    vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
-
-    SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
-                     vec0, vec1, vec2, vec3, FILTER_BITS);
-
-    vec0 = __msa_min_u_h(vec0, const255);
-    vec1 = __msa_min_u_h(vec1, const255);
-    vec2 = __msa_min_u_h(vec2, const255);
-    vec3 = __msa_min_u_h(vec3, const255);
-
-    LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);
 
-    PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
-    dst += (4 * dst_stride);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
 
-    vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
-    vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
-    vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
-    vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
-
-    vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
-    vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
-    vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
-    vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
-
-    SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
-                     vec0, vec1, vec2, vec3, FILTER_BITS);
-
-    vec0 = __msa_min_u_h(vec0, const255);
-    vec1 = __msa_min_u_h(vec1, const255);
-    vec2 = __msa_min_u_h(vec2, const255);
-    vec3 = __msa_min_u_h(vec3, const255);
-
-    PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+                vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
   }
 }
 
@@ -668,136 +487,68 @@
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8u16 filt, const255;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   loop_cnt = (height >> 2) - 1;
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
-  src0 = LOAD_SB(src);
-  src1 = LOAD_SB(src + 8);
-  src += src_stride;
-  src2 = LOAD_SB(src);
-  src3 = LOAD_SB(src + 8);
-  src += src_stride;
-  src4 = LOAD_SB(src);
-  src5 = LOAD_SB(src + 8);
-  src += src_stride;
-  src6 = LOAD_SB(src);
-  src7 = LOAD_SB(src + 8);
-  src += src_stride;
+  LD_SB4(src, src_stride, src0, src2, src4, src6);
+  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+  src += (4 * src_stride);
 
-  vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
-  vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
-  vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
-  vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
-  vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
-  vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
-  vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
-  vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
-
-  out0 = __msa_dotp_u_h(vec0, filt0);
-  out1 = __msa_dotp_u_h(vec1, filt0);
-  out2 = __msa_dotp_u_h(vec2, filt0);
-  out3 = __msa_dotp_u_h(vec3, filt0);
-  out4 = __msa_dotp_u_h(vec4, filt0);
-  out5 = __msa_dotp_u_h(vec5, filt0);
-  out6 = __msa_dotp_u_h(vec6, filt0);
-  out7 = __msa_dotp_u_h(vec7, filt0);
-
-  out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
-  out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
-  out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
-  out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
-  out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
-  out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
-  out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
-  out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
-
-  out0 = __msa_min_u_h(out0, const255);
-  out1 = __msa_min_u_h(out1, const255);
-  out2 = __msa_min_u_h(out2, const255);
-  out3 = __msa_min_u_h(out3, const255);
-  out4 = __msa_min_u_h(out4, const255);
-  out5 = __msa_min_u_h(out5, const255);
-  out6 = __msa_min_u_h(out6, const255);
-  out7 = __msa_min_u_h(out7, const255);
-
-  PCKEV_B_STORE_VEC(out1, out0, dst);
+  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+              out2, out3);
+  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+              out6, out7);
+  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+  MIN_UH4_UH(out0, out1, out2, out3, const255);
+  MIN_UH4_UH(out4, out5, out6, out7, const255);
+  PCKEV_ST_SB(out0, out1, dst);
   dst += dst_stride;
-  PCKEV_B_STORE_VEC(out3, out2, dst);
+  PCKEV_ST_SB(out2, out3, dst);
   dst += dst_stride;
-  PCKEV_B_STORE_VEC(out5, out4, dst);
+  PCKEV_ST_SB(out4, out5, dst);
   dst += dst_stride;
-  PCKEV_B_STORE_VEC(out7, out6, dst);
+  PCKEV_ST_SB(out6, out7, dst);
   dst += dst_stride;
 
   for (; loop_cnt--;) {
-    src0 = LOAD_SB(src);
-    src1 = LOAD_SB(src + 8);
-    src += src_stride;
-    src2 = LOAD_SB(src);
-    src3 = LOAD_SB(src + 8);
-    src += src_stride;
-    src4 = LOAD_SB(src);
-    src5 = LOAD_SB(src + 8);
-    src += src_stride;
-    src6 = LOAD_SB(src);
-    src7 = LOAD_SB(src + 8);
-    src += src_stride;
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
 
-    vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
-    vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
-    vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
-    vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
-    vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
-    vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
-    vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
-    vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
-
-    out0 = __msa_dotp_u_h(vec0, filt0);
-    out1 = __msa_dotp_u_h(vec1, filt0);
-    out2 = __msa_dotp_u_h(vec2, filt0);
-    out3 = __msa_dotp_u_h(vec3, filt0);
-    out4 = __msa_dotp_u_h(vec4, filt0);
-    out5 = __msa_dotp_u_h(vec5, filt0);
-    out6 = __msa_dotp_u_h(vec6, filt0);
-    out7 = __msa_dotp_u_h(vec7, filt0);
-
-    out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
-    out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
-    out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
-    out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
-    out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
-    out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
-    out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
-    out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
-
-    out0 = __msa_min_u_h(out0, const255);
-    out1 = __msa_min_u_h(out1, const255);
-    out2 = __msa_min_u_h(out2, const255);
-    out3 = __msa_min_u_h(out3, const255);
-    out4 = __msa_min_u_h(out4, const255);
-    out5 = __msa_min_u_h(out5, const255);
-    out6 = __msa_min_u_h(out6, const255);
-    out7 = __msa_min_u_h(out7, const255);
-
-    PCKEV_B_STORE_VEC(out1, out0, dst);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    MIN_UH4_UH(out0, out1, out2, out3, const255);
+    MIN_UH4_UH(out4, out5, out6, out7, const255);
+    PCKEV_ST_SB(out0, out1, dst);
     dst += dst_stride;
-    PCKEV_B_STORE_VEC(out3, out2, dst);
+    PCKEV_ST_SB(out2, out3, dst);
     dst += dst_stride;
-    PCKEV_B_STORE_VEC(out5, out4, dst);
+    PCKEV_ST_SB(out4, out5, dst);
     dst += dst_stride;
-    PCKEV_B_STORE_VEC(out7, out6, dst);
+    PCKEV_ST_SB(out6, out7, dst);
     dst += dst_stride;
   }
 }
@@ -807,72 +558,46 @@
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8u16 filt, const255;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
   for (loop_cnt = height >> 1; loop_cnt--;) {
-    src0 = LOAD_SB(src);
-    src2 = LOAD_SB(src + 16);
-    src3 = LOAD_SB(src + 24);
-    src1 = __msa_sld_b(src2, src0, 8);
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src3 = LD_SB(src + 24);
+    src1 = __msa_sldi_b(src2, src0, 8);
     src += src_stride;
-    src4 = LOAD_SB(src);
-    src6 = LOAD_SB(src + 16);
-    src7 = LOAD_SB(src + 24);
-    src5 = __msa_sld_b(src6, src4, 8);
+    src4 = LD_SB(src);
+    src6 = LD_SB(src + 16);
+    src7 = LD_SB(src + 24);
+    src5 = __msa_sldi_b(src6, src4, 8);
     src += src_stride;
 
-    vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
-    vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
-    vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
-    vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
-    vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
-    vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
-    vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
-    vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
-
-    out0 = __msa_dotp_u_h(vec0, filt0);
-    out1 = __msa_dotp_u_h(vec1, filt0);
-    out2 = __msa_dotp_u_h(vec2, filt0);
-    out3 = __msa_dotp_u_h(vec3, filt0);
-    out4 = __msa_dotp_u_h(vec4, filt0);
-    out5 = __msa_dotp_u_h(vec5, filt0);
-    out6 = __msa_dotp_u_h(vec6, filt0);
-    out7 = __msa_dotp_u_h(vec7, filt0);
-
-    out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
-    out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
-    out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
-    out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
-    out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
-    out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
-    out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
-    out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
-
-    out0 = __msa_min_u_h(out0, const255);
-    out1 = __msa_min_u_h(out1, const255);
-    out2 = __msa_min_u_h(out2, const255);
-    out3 = __msa_min_u_h(out3, const255);
-    out4 = __msa_min_u_h(out4, const255);
-    out5 = __msa_min_u_h(out5, const255);
-    out6 = __msa_min_u_h(out6, const255);
-    out7 = __msa_min_u_h(out7, const255);
-
-    PCKEV_B_STORE_VEC(out1, out0, dst);
-    PCKEV_B_STORE_VEC(out3, out2, dst + 16);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    MIN_UH4_UH(out0, out1, out2, out3, const255);
+    MIN_UH4_UH(out4, out5, out6, out7, const255);
+    PCKEV_ST_SB(out0, out1, dst);
+    PCKEV_ST_SB(out2, out3, dst + 16);
     dst += dst_stride;
-    PCKEV_B_STORE_VEC(out5, out4, dst);
-    PCKEV_B_STORE_VEC(out7, out6, dst + 16);
+    PCKEV_ST_SB(out4, out5, dst);
+    PCKEV_ST_SB(out6, out7, dst + 16);
     dst += dst_stride;
   }
 }
@@ -882,70 +607,42 @@
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8u16 filt, const255;
+  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter);
+  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
+  const255 = (v8u16) __msa_ldi_h(255);
 
   for (loop_cnt = height; loop_cnt--;) {
-    src0 = LOAD_SB(src);
-    src2 = LOAD_SB(src + 16);
-    src4 = LOAD_SB(src + 32);
-    src6 = LOAD_SB(src + 48);
-    src7 = LOAD_SB(src + 56);
-    src1 = __msa_sld_b(src2, src0, 8);
-    src3 = __msa_sld_b(src4, src2, 8);
-    src5 = __msa_sld_b(src6, src4, 8);
+    src0 = LD_SB(src);
+    src2 = LD_SB(src + 16);
+    src4 = LD_SB(src + 32);
+    src6 = LD_SB(src + 48);
+    src7 = LD_SB(src + 56);
+    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
     src += src_stride;
 
-    vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
-    vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
-    vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
-    vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
-    vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
-    vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
-    vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
-    vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
-
-    out0 = __msa_dotp_u_h(vec0, filt0);
-    out1 = __msa_dotp_u_h(vec1, filt0);
-    out2 = __msa_dotp_u_h(vec2, filt0);
-    out3 = __msa_dotp_u_h(vec3, filt0);
-    out4 = __msa_dotp_u_h(vec4, filt0);
-    out5 = __msa_dotp_u_h(vec5, filt0);
-    out6 = __msa_dotp_u_h(vec6, filt0);
-    out7 = __msa_dotp_u_h(vec7, filt0);
-
-    out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
-    out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
-    out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
-    out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
-    out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
-    out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
-    out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
-    out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
-
-    out0 = __msa_min_u_h(out0, const255);
-    out1 = __msa_min_u_h(out1, const255);
-    out2 = __msa_min_u_h(out2, const255);
-    out3 = __msa_min_u_h(out3, const255);
-    out4 = __msa_min_u_h(out4, const255);
-    out5 = __msa_min_u_h(out5, const255);
-    out6 = __msa_min_u_h(out6, const255);
-    out7 = __msa_min_u_h(out7, const255);
-
-    PCKEV_B_STORE_VEC(out1, out0, dst);
-    PCKEV_B_STORE_VEC(out3, out2, dst + 16);
-    PCKEV_B_STORE_VEC(out5, out4, dst + 32);
-    PCKEV_B_STORE_VEC(out7, out6, dst + 48);
+    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+                out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+                out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    MIN_UH4_UH(out0, out1, out2, out3, const255);
+    MIN_UH4_UH(out4, out5, out6, out7, const255);
+    PCKEV_ST_SB(out0, out1, dst);
+    PCKEV_ST_SB(out2, out3, dst + 16);
+    PCKEV_ST_SB(out4, out5, dst + 32);
+    PCKEV_ST_SB(out6, out7, dst + 48);
     dst += dst_stride;
   }
 }
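
As a reading aid for the 2t (bilinear) hunks above: VSHF_B2 pairs each pixel with its right neighbour, DOTP_UB forms the two-tap dot product, SRARI_H performs the rounded shift by FILTER_BITS, and MIN_UH clamps to 255 before the even-byte pack. A hypothetical scalar equivalent of one row, assuming FILTER_BITS is 7 and that the two live taps arrive as filter[0]/filter[1], the way these helpers receive them:

#include <stdint.h>

#define FILTER_BITS 7  /* assumption: vp9's filter precision */

/* Hypothetical scalar reference for one row of the 2-tap horizontal path. */
static void hz_2t_row_ref(const uint8_t *src, uint8_t *dst, int32_t width,
                          const int8_t *filter) {
  const int32_t f0 = filter[0], f1 = filter[1];
  int32_t i, sum;
  for (i = 0; i < width; ++i) {
    sum = src[i] * f0 + src[i + 1] * f1;                    /* DOTP_UB */
    sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;  /* SRARI_H */
    dst[i] = (uint8_t)(sum > 255 ? 255 : sum);              /* MIN_UH */
  }
}
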
--- a/vp9/common/mips/msa/vp9_convolve8_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve8_msa.c
@@ -26,93 +26,68 @@
                                      int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
-  v16u8 mask0, mask1, mask2, mask3;
-  v8i16 filt_horiz;
-  v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
-  v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9;
-  v8i16 tmp0, tmp1, out0, out1, out2, out3, out4;
-  v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
   src -= (3 + 3 * src_stride);
 
   /* rearranging filter */
-  filt_horiz = LOAD_SH(filter_horiz);
-  filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
-  filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
-  filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
-  filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
   mask3 = mask0 + 6;
 
-  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
   src += (7 * src_stride);
 
-  XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
-                  src0, src1, src2, src3, src4, src5, src6, 128);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
 
-  horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3,
-                                     filt_horiz0, filt_horiz1, filt_horiz2,
-                                     filt_horiz3);
-  horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3,
-                                     filt_horiz0, filt_horiz1, filt_horiz2,
-                                     filt_horiz3);
-  horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3,
-                                     filt_horiz0, filt_horiz1, filt_horiz2,
-                                     filt_horiz3);
-  horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3,
-                                     filt_horiz0, filt_horiz1, filt_horiz2,
-                                     filt_horiz3);
-  horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
-  horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
 
-  filt = LOAD_SH(filter_vert);
-  filt_vert0 = __msa_splati_h(filt, 0);
-  filt_vert1 = __msa_splati_h(filt, 1);
-  filt_vert2 = __msa_splati_h(filt, 2);
-  filt_vert3 = __msa_splati_h(filt, 3);
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
 
-  out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-  out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
-  out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
-
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+    hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
 
-    horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3,
-                                       filt_horiz0, filt_horiz1, filt_horiz2,
-                                       filt_horiz3);
-    horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 8);
-
-    out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
-
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
-                               filt_vert2, filt_vert3);
-
-    horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3,
-                                       filt_horiz0, filt_horiz1, filt_horiz2,
-                                       filt_horiz3);
-    horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8);
-
-    out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
-
-    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1,
-                               filt_vert2, filt_vert3);
-    tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride);
+    hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
+    out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
+    SAT_SH2_SH(tmp0, tmp1, 7);
+    out = PCKEV_XORI128_UB(tmp0, tmp1);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
     dst += (4 * dst_stride);
 
-    horiz_out5 = horiz_out9;
-
+    hz_out5 = hz_out9;
     out0 = out2;
     out1 = out3;
     out2 = out4;
@@ -125,108 +100,87 @@
                                      int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
-  v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
-  v16u8 mask0, mask1, mask2, mask3;
-  v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
-  v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7;
-  v8i16 horiz_out8, horiz_out9, horiz_out10;
+  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
+  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
   v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
-  v8i16 tmp0, tmp1, tmp2, tmp3;
 
-  mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
-
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
   src -= (3 + 3 * src_stride);
 
   /* rearranging filter */
-  filt_horiz = LOAD_SH(filter_horiz);
-  filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
-  filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
-  filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
-  filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
+  filt = LD_SH(filter_horiz);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
 
   mask1 = mask0 + 2;
   mask2 = mask0 + 4;
   mask3 = mask0 + 6;
 
-  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
   src += (7 * src_stride);
 
-  XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
-                  src0, src1, src2, src3, src4, src5, src6, 128);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
 
-  horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
-  horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
-  horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
-  horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
-  horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
-  horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
-  horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0,
-                               filt_horiz1, filt_horiz2, filt_horiz3);
+  filt = LD_SH(filter_vert);
+  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
 
-  filt = LOAD_SH(filter_vert);
-  filt_vert0 = __msa_splati_h(filt, 0);
-  filt_vert1 = __msa_splati_h(filt, 1);
-  filt_vert2 = __msa_splati_h(filt, 2);
-  filt_vert3 = __msa_splati_h(filt, 3);
+  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
+  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
+  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
 
-  out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-  out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
-  out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
-  out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1);
-  out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3);
-  out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5);
-
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+    XORI_B4_128_SB(src7, src8, src9, src10);
 
-    horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0,
-                                 filt_horiz1, filt_horiz2, filt_horiz3);
+    hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
+    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
 
-    out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
-    tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
-                               filt_vert2, filt_vert3);
-    tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
+    hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
+    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
 
-    horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, mask3, filt_horiz0,
-                                 filt_horiz1, filt_horiz2, filt_horiz3);
+    hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
+                              filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
+    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
 
-    out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7);
-    tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1,
-                               filt_vert2, filt_vert3);
-    tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
-
-    horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0,
-                                 filt_horiz1, filt_horiz2, filt_horiz3);
-
-    out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
-    tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1,
-                               filt_vert2, filt_vert3);
-    tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7);
-
-    horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3,
-                                  filt_horiz0, filt_horiz1, filt_horiz2,
-                                  filt_horiz3);
-
-    out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9);
-    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1,
-                               filt_vert2, filt_vert3);
-    tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
+                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+    out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
+    tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+    vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
+    vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
+    ST8x4_UB(vec0, vec1, dst, dst_stride);
     dst += (4 * dst_stride);
 
-    horiz_out6 = horiz_out10;
-
+    hz_out6 = hz_out10;
     out0 = out2;
     out1 = out3;
     out2 = out8;
@@ -279,62 +233,32 @@
                                       uint8_t *dst, int32_t dst_stride,
                                       int8_t *filter_horiz,
                                       int8_t *filter_vert) {
-  uint32_t out0, out1, out2, out3;
   v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 res0, res1, horiz_vec;
-  v16u8 filt_vert, filt_horiz, vec0, vec1;
-  v8u16 filt, tmp0, tmp1;
-  v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
+  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[16]);
+  mask = LD_SB(&mc_filt_mask_arr[16]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter_horiz);
-  filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  filt = LOAD_UH(filter_vert);
-  filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
-  horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
-  horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
-  horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
-
-  horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
-  horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2);
-
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
-
-  tmp0 = __msa_dotp_u_h(vec0, filt_vert);
-  tmp1 = __msa_dotp_u_h(vec1, filt_vert);
-  tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-  tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-  res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0);
-  res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1);
-
-  out0 = __msa_copy_u_w((v4i32)res0, 0);
-  out1 = __msa_copy_u_w((v4i32)res0, 1);
-  out2 = __msa_copy_u_w((v4i32)res1, 0);
-  out3 = __msa_copy_u_w((v4i32)res1, 1);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  SAT_UH2_UH(tmp0, tmp1, 7);
+  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }
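
For reference, the separable 2-tap path above reduces to the following scalar
sketch (a minimal model; the helper name is illustrative, and it assumes
FILTER_BITS == 7 with non-negative bilinear taps that sum to 128):

  static void bilinear_2t_hv_ref(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 const int8_t *fh, const int8_t *fv,
                                 int w, int h) {
    int x, y;
    for (y = 0; y < h; ++y) {
      for (x = 0; x < w; ++x) {
        /* HORIZ_2TAP_FILT_UH: gather a pixel pair via the shuffle mask,
         * dot-product it with the taps, then SRARI_H rounds: (s + 64) >> 7.
         * Because the taps sum to 128, each horizontal result already
         * fits in 8 bits. */
        unsigned h0 = (src[y * src_stride + x] * fh[0] +
                       src[y * src_stride + x + 1] * fh[1] + 64) >> 7;
        unsigned h1 = (src[(y + 1) * src_stride + x] * fh[0] +
                       src[(y + 1) * src_stride + x + 1] * fh[1] + 64) >> 7;
        /* Vertical DOTP_UB + SRARI_H + SAT_UH(.., 7): round again and
         * clamp to the 8-bit range before PCKEV_B packs the bytes. */
        unsigned v = (h0 * fv[0] + h1 * fv[1] + 64) >> 7;
        dst[y * dst_stride + x] = (uint8_t)(v > 255 ? 255 : v);
      }
    }
  }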
 
 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
@@ -341,113 +265,57 @@
                                       uint8_t *dst, int32_t dst_stride,
                                       int8_t *filter_horiz,
                                       int8_t *filter_vert) {
-  uint32_t out0, out1, out2, out3;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
-  v16u8 filt_horiz, filt_vert, horiz_vec;
-  v16u8 vec0, vec1, vec2, vec3;
-  v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
-  v8u16 vec4, vec5, vec6, vec7, filt;
-  v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8;
   v16i8 res0, res1, res2, res3;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[16]);
+  mask = LD_SB(&mc_filt_mask_arr[16]);
 
   /* rearranging filter */
-  filt = LOAD_UH(filter_horiz);
-  filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  filt = LOAD_UH(filter_vert);
-  filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_UH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  LOAD_8VECS_SB(src, src_stride,
-                src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   src += (8 * src_stride);
-  src8 = LOAD_SB(src);
+  src8 = LD_SB(src);
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
-  horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
+  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
+  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
+  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
+             hz_out3, hz_out5, 8);
+  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
-  horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4);
-  horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6);
-  horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8);
-  horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7);
-
-  horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
-  horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
-  horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8);
-  horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6);
-
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
-  vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
-  vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
-
-  vec4 = __msa_dotp_u_h(vec0, filt_vert);
-  vec5 = __msa_dotp_u_h(vec1, filt_vert);
-  vec6 = __msa_dotp_u_h(vec2, filt_vert);
-  vec7 = __msa_dotp_u_h(vec3, filt_vert);
-
-  vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7);
-  vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7);
-  vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7);
-  vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7);
-
-  res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
-  res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
-  res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
-  res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
-
-  out0 = __msa_copy_u_w((v4i32)res0, 0);
-  out1 = __msa_copy_u_w((v4i32)res0, 1);
-  out2 = __msa_copy_u_w((v4i32)res1, 0);
-  out3 = __msa_copy_u_w((v4i32)res1, 1);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
-  dst += dst_stride;
-
-  out0 = __msa_copy_u_w((v4i32)res2, 0);
-  out1 = __msa_copy_u_w((v4i32)res2, 1);
-  out2 = __msa_copy_u_w((v4i32)res3, 0);
-  out3 = __msa_copy_u_w((v4i32)res3, 1);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
+  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
+              vec4, vec5, vec6, vec7);
+  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
+  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
+              res2, res3);
+  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+  dst += (4 * dst_stride);
+  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
 }
 
 static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter_horiz,
-                                     int8_t *filter_vert,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
                                      int32_t height) {
   if (4 == height) {
-    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
-                              filter_horiz, filter_vert);
+    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
   } else if (8 == height) {
-    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
-                              filter_horiz, filter_vert);
+    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
   }
 }
 
@@ -455,63 +323,43 @@
                                       uint8_t *dst, int32_t dst_stride,
                                       int8_t *filter_horiz,
                                       int8_t *filter_vert) {
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_horiz, filt_vert, horiz_vec;
-  v16u8 vec0, vec1, vec2, vec3;
-  v8u16 horiz_out0, horiz_out1;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
   v8i16 filt;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_SH(filter_horiz);
-  filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
 
-  filt = LOAD_SH(filter_vert);
-  filt_vert = (v16u8)__msa_splati_h(filt, 0);
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
-  src += (5 * src_stride);
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
-  horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
-  horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
 
-  vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-  tmp0 = __msa_dotp_u_h(vec0, filt_vert);
+  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
-  horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
-  vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-  tmp1 = __msa_dotp_u_h(vec1, filt_vert);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
-  horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
-
-  vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-  tmp2 = __msa_dotp_u_h(vec2, filt_vert);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
-  horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
-
-  vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-  tmp3 = __msa_dotp_u_h(vec3, filt_vert);
-
-  tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-  tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-  tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-  tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-  PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
 }
 
 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
@@ -522,106 +370,76 @@
                                           int8_t *filter_vert,
                                           int32_t height) {
   uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, mask;
-  v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
-  v8u16 horiz_out0, horiz_out1;
-  v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
+  v16u8 filt_hz, filt_vt, vec0;
+  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
   v8i16 filt;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_SH(filter_horiz);
-  filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
 
-  filt = LOAD_SH(filter_vert);
-  filt_vert = (v16u8)__msa_splati_h(filt, 0);
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_SB(src);
+  src0 = LD_SB(src);
   src += src_stride;
 
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
-  horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
 
   for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
     src += (4 * src_stride);
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
-    horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
 
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
-    horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    SAT_UH2_UH(tmp1, tmp2, 7);
 
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-    tmp2 = (v8u16)__msa_dotp_u_h(vec0, filt_vert);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
-    horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vert);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
-    horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
-
-    LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    LD_SB4(src, src_stride, src1, src2, src3, src4);
     src += (4 * src_stride);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp4 = __msa_dotp_u_h(vec0, filt_vt);
 
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-    tmp4 = __msa_dotp_u_h(vec0, filt_vert);
-
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride);
+    SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
+    SAT_UH2_UH(tmp3, tmp4, 7);
+    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
-    horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp5 = __msa_dotp_u_h(vec0, filt_vt);
 
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-    tmp5 = __msa_dotp_u_h(vec0, filt_vert);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp6 = __msa_dotp_u_h(vec0, filt_vt);
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
-    horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+    tmp7 = __msa_dotp_u_h(vec0, filt_vt);
 
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-    tmp6 = __msa_dotp_u_h(vec0, filt_vert);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+    tmp8 = __msa_dotp_u_h(vec0, filt_vt);
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
-    horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-    tmp7 = __msa_dotp_u_h(vec0, filt_vert);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
-    horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-    tmp8 = __msa_dotp_u_h(vec0, filt_vert);
-
-    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
-    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
-    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
-    tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride);
+    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
+    SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
+    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
   }
 }
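
The 8-wide loop above computes one horizontal result before entering the loop
and then ping-pongs hz_out0/hz_out1, so every filtered row is reused as the
previous vertical tap of the next output row instead of being refiltered. A
scalar sketch of that rolling pattern (illustrative name, same
FILTER_BITS == 7 assumption as before):

  static void hv_2t_8w_rolling_ref(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   const int8_t *fh, const int8_t *fv,
                                   int height) {
    unsigned prev[8], cur[8];
    int x, y;
    for (x = 0; x < 8; ++x)  /* prologue: the hz_out0 computed pre-loop */
      prev[x] = (src[x] * fh[0] + src[x + 1] * fh[1] + 64) >> 7;
    for (y = 1; y <= height; ++y) {
      for (x = 0; x < 8; ++x) {
        unsigned v;
        cur[x] = (src[y * src_stride + x] * fh[0] +
                  src[y * src_stride + x + 1] * fh[1] + 64) >> 7;
        v = (prev[x] * fv[0] + cur[x] * fv[1] + 64) >> 7;
        dst[(y - 1) * dst_stride + x] = (uint8_t)(v > 255 ? 255 : v);
        prev[x] = cur[x];  /* the hz_out1 -> hz_out0 hand-off */
      }
    }
  }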
@@ -645,108 +463,64 @@
                                       int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
-  v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2;
-  v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+  v16u8 filt_hz, filt_vt, vec0, vec1;
+  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
   v8i16 filt;
 
-  mask = LOAD_SB(&mc_filt_mask_arr[0]);
+  mask = LD_SB(&mc_filt_mask_arr[0]);
 
   /* rearranging filter */
-  filt = LOAD_SH(filter_horiz);
-  filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+  filt = LD_SH(filter_horiz);
+  filt_hz = (v16u8)__msa_splati_h(filt, 0);
 
-  filt = LOAD_SH(filter_vert);
-  filt_vert = (v16u8)__msa_splati_h(filt, 0);
+  filt = LD_SH(filter_vert);
+  filt_vt = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_SB(src);
-  src1 = LOAD_SB(src + 8);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
-  horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
-
-  horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
-  horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-  horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
-
+  LD_SB2(src, 8, src0, src1);
   src += src_stride;
 
+  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6);
-    LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7);
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
     src += (4 * src_stride);
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
-    horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
-    horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vert);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vert);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    SAT_UH2_UH(tmp1, tmp2, 7);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
-    horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
-    horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vert);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vert);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    SAT_UH2_UH(tmp1, tmp2, 7);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
-    horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5);
-    horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vert);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vert);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    SAT_UH2_UH(tmp1, tmp2, 7);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
 
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6);
-    horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
-
-    horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7);
-    horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
-    horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vert);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vert);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
+    SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
+    SAT_UH2_UH(tmp1, tmp2, 7);
+    PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
   }
 }
--- a/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
@@ -16,58 +16,48 @@
                                 int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
-  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
-  v16i8 src2110, src4332, src6554, src8776, src10998;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
   v8i16 filt, out10, out32;
-  v16i8 filt0, filt1, filt2, filt3;
 
   src -= (3 * src_stride);
 
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
-  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
   src += (7 * src_stride);
 
-  ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                  src1, src3, src5, src2, src4, src6,
-                  src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
 
-  ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
-                  src6554, src65_r, src54_r);
-
-  XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);
-
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                    src76_r, src87_r, src98_r, src109_r);
-
-    ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r);
-
-    XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128);
-
-    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776,
-                                filt0, filt1, filt2, filt3);
-    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998,
-                                filt0, filt1, filt2, filt3);
-
-    out10 = SRARI_SATURATE_SIGNED_H(out10, FILTER_BITS, 7);
-    out32 = SRARI_SATURATE_SIGNED_H(out32, FILTER_BITS, 7);
-
-    PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src2110 = src6554;
     src4332 = src8776;
     src6554 = src10998;
-
     src6 = src10;
   }
 }
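
The signed 8-tap paths here all lean on the xor-by-128 trick: XORI_B*_128
maps unsigned pixels into a signed domain so FILT_8TAP_DPADD_S_H can use
signed dot products, and PCKEV_XORI128_UB removes the bias while packing. A
per-pixel scalar model (hypothetical helper; assumes FILTER_BITS == 7, taps
summing to 128, and an arithmetic right shift on negative values):

  static uint8_t filt_8tap_ref(const uint8_t *p, int step, const int8_t *f) {
    int32_t sum = 0;
    int k;
    /* x ^ 128 == x - 128 for 8-bit data; since the taps sum to 128, the
     * bias cancels once it is xor'ed back after the shift. */
    for (k = 0; k < 8; ++k)
      sum += (int32_t)(int8_t)(p[k * step] ^ 128) * f[k];
    sum = (sum + 64) >> 7;            /* SRARI_H: rounding shift */
    if (sum < -128) sum = -128;       /* SAT_SH(.., 7): clamp to s8 */
    if (sum > 127) sum = 127;
    return (uint8_t)(sum ^ 128);      /* PCKEV_XORI128_UB bias removal */
  }

With step == src_stride this models the vertical filter; with step == 1, the
horizontal one.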
@@ -77,54 +67,115 @@
                                 int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
-  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
-  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
   v8i16 filt, out0_r, out1_r, out2_r, out3_r;
 
   src -= (3 * src_stride);
 
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
-  LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
   src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
 
-  XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
-                  src0, src1, src2, src3, src4, src5, src6, 128);
-
-  ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                  src1, src3, src5, src2, src4, src6,
-                  src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
-
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
     src += (4 * src_stride);
 
-    XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+    dst += (4 * dst_stride);
 
-    ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                    src76_r, src87_r, src98_r, src109_r);
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src54_r = src98_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src65_r = src109_r;
+    src6 = src10;
+  }
+}
 
-    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
-                                 filt0, filt1, filt2, filt3);
-    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
-                                 filt0, filt1, filt2, filt3);
-    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
-                                 filt0, filt1, filt2, filt3);
-    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
-                                 filt0, filt1, filt2, filt3);
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+  v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
-    out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
-    out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
-    out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
-    out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
+  src -= (3 * src_stride);
 
-    PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r,
-                                      dst, dst_stride);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+             src54_l, src21_l);
+  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+               src87_l, src98_l, src109_l);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                 filt1, filt2, filt3);
+    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                 filt1, filt2, filt3);
+    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                 filt1, filt2, filt3);
+    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+                tmp0, tmp1, tmp2, tmp3);
+    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src10_r = src54_r;
@@ -133,7 +184,12 @@
     src21_r = src65_r;
     src43_r = src87_r;
     src65_r = src109_r;
-
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src54_l = src98_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src65_l = src109_l;
     src6 = src10;
   }
 }
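
The 16-wide version sends columns 0..7 through the ILVR_B (_r) interleaves
and columns 8..15 through ILVL_B (_l), then PCKEV_B4_UB zips the two halves
back into full 16-byte rows. With a per-pixel helper like the filt_8tap_ref
sketched above, the whole routine reduces to (illustrative only):

  static void vt_8t_16w_ref(const uint8_t *src, int src_stride,
                            uint8_t *dst, int dst_stride,
                            const int8_t *filter, int height) {
    int x, y;
    src -= 3 * src_stride;  /* same three-row lead-in as the MSA code */
    for (y = 0; y < height; ++y)
      for (x = 0; x < 16; ++x)
        dst[y * dst_stride + x] =
            filt_8tap_ref(src + y * src_stride + x, src_stride, filter);
  }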
@@ -147,89 +203,63 @@
   uint32_t loop_cnt, cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
   v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
-  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
-  v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
-  v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
-  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-  v8i16 filt;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
   v16u8 tmp0, tmp1, tmp2, tmp3;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
 
   src -= (3 * src_stride);
 
-  filt = LOAD_SH(filter);
-  filt0 = (v16i8)__msa_splati_h(filt, 0);
-  filt1 = (v16i8)__msa_splati_h(filt, 1);
-  filt2 = (v16i8)__msa_splati_h(filt, 2);
-  filt3 = (v16i8)__msa_splati_h(filt, 3);
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
 
   for (cnt = (width >> 4); cnt--;) {
     src_tmp = src;
     dst_tmp = dst;
 
-    LOAD_7VECS_SB(src_tmp, src_stride,
-                  src0, src1, src2, src3, src4, src5, src6);
+    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
     src_tmp += (7 * src_stride);
+    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
+               src32_r, src54_r, src21_r);
+    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
+               src32_l, src54_l, src21_l);
+    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
 
-    XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
-                    src0, src1, src2, src3, src4, src5, src6, 128);
-
-    ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                    src1, src3, src5, src2, src4, src6,
-                    src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
-
-    ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
-                    src1, src3, src5, src2, src4, src6,
-                    src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);
-
     for (loop_cnt = (height >> 2); loop_cnt--;) {
-      LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
+      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+      XORI_B4_128_SB(src7, src8, src9, src10);
       src_tmp += (4 * src_stride);
-
-      XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
-
-      ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                      src76_r, src87_r, src98_r, src109_r);
-
-      ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
-                      src76_l, src87_l, src98_l, src109_l);
-
-      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
-                                   filt0, filt1, filt2, filt3);
-      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
-                                   filt0, filt1, filt2, filt3);
-      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
-                                   filt0, filt1, filt2, filt3);
-      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
-                                   filt0, filt1, filt2, filt3);
-
-      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
-                                   filt0, filt1, filt2, filt3);
-      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
-                                   filt0, filt1, filt2, filt3);
-      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
-                                   filt0, filt1, filt2, filt3);
-      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
-                                   filt0, filt1, filt2, filt3);
-
-      out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
-      out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
-      out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
-      out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
-      out0_l = SRARI_SATURATE_SIGNED_H(out0_l, FILTER_BITS, 7);
-      out1_l = SRARI_SATURATE_SIGNED_H(out1_l, FILTER_BITS, 7);
-      out2_l = SRARI_SATURATE_SIGNED_H(out2_l, FILTER_BITS, 7);
-      out3_l = SRARI_SATURATE_SIGNED_H(out3_l, FILTER_BITS, 7);
-
-      out0_r = (v8i16)__msa_pckev_b((v16i8)out0_l, (v16i8)out0_r);
-      out1_r = (v8i16)__msa_pckev_b((v16i8)out1_l, (v16i8)out1_r);
-      out2_r = (v8i16)__msa_pckev_b((v16i8)out2_l, (v16i8)out2_r);
-      out3_r = (v8i16)__msa_pckev_b((v16i8)out3_l, (v16i8)out3_r);
-
-      XORI_B_4VECS_UB(out0_r, out1_r, out2_r, out3_r,
-                      tmp0, tmp1, tmp2, tmp3, 128);
-
-      STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3);
+      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+                 src87_r, src98_r, src109_r);
+      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+                 src87_l, src98_l, src109_l);
+      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                   filt1, filt2, filt3);
+      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                   filt1, filt2, filt3);
+      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                   filt1, filt2, filt3);
+      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                   filt1, filt2, filt3);
+      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+                                   filt1, filt2, filt3);
+      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+                                   filt1, filt2, filt3);
+      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+                                   filt1, filt2, filt3);
+      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+                                   filt1, filt2, filt3);
+      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+                  out3_r, tmp0, tmp1, tmp2, tmp3);
+      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
       dst_tmp += (4 * dst_stride);
 
       src10_r = src54_r;
@@ -238,7 +268,6 @@
       src21_r = src65_r;
       src43_r = src87_r;
       src65_r = src109_r;
-
       src10_l = src54_l;
       src32_l = src76_l;
       src54_l = src98_l;
@@ -245,7 +274,6 @@
       src21_l = src65_l;
       src43_l = src87_l;
       src65_l = src109_l;
-
       src6 = src10;
     }
 
@@ -254,134 +282,77 @@
   }
 }
 
-static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
-                            filter, height, 16);
-}
-
 static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
-                            filter, height, 32);
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            32);
 }
 
 static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
-                            filter, height, 64);
+  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+                            64);
 }
 
 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  uint32_t out0, out1, out2, out3;
   v16i8 src0, src1, src2, src3, src4;
   v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
-  v16i8 filt0;
-  v8u16 filt;
+  v16u8 filt0;
+  v8i16 filt;
+  v8u16 tmp0, tmp1;
 
-  filt = LOAD_UH(filter);
-  filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
   src += (5 * src_stride);
 
-  ILVR_B_4VECS_SB(src0, src1, src2, src3, src1, src2, src3, src4,
-                  src10_r, src21_r, src32_r, src43_r);
-
-  ILVR_D_2VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r);
-
-  src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
-  src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
-
-  src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
-  src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
-
-  src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
-
-  out0 = __msa_copy_u_w((v4i32)src2110, 0);
-  out1 = __msa_copy_u_w((v4i32)src2110, 1);
-  out2 = __msa_copy_u_w((v4i32)src2110, 2);
-  out3 = __msa_copy_u_w((v4i32)src2110, 3);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+  SAT_UH2_UH(tmp0, tmp1, 7);
+  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 }
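
The src2110/src4332 names encode the row pairing of the 4-wide path: ILVR_B
interleaves consecutive rows column by column and ILVR_D stacks two such
pairs in one register, so a single DOTP_UB2_UH covers all four 4-pixel
output rows. A scalar sketch of the src2110 byte layout (hypothetical
helper):

  static void build_src2110(const uint8_t *r0, const uint8_t *r1,
                            const uint8_t *r2, uint8_t out[16]) {
    int i;
    for (i = 0; i < 4; ++i) {      /* low half: rows 1,0 interleaved */
      out[2 * i + 0] = r0[i];      /* pairs with vertical tap 0 */
      out[2 * i + 1] = r1[i];      /* pairs with vertical tap 1 */
    }
    for (i = 0; i < 4; ++i) {      /* high half: rows 2,1 interleaved */
      out[8 + 2 * i + 0] = r1[i];
      out[8 + 2 * i + 1] = r2[i];
    }
  }

Each byte pair then dot-products against the splatted {tap0, tap1} halfword,
yielding output rows 0 and 1 from src2110 and rows 2 and 3 from src4332.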
 
 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  uint32_t out0, out1, out2, out3, out4, out5, out6, out7;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
   v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
   v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
-  v16i8 filt0;
-  v8u16 filt;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 filt;
 
-  filt = LOAD_UH(filter);
-  filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_8VECS_SB(src, src_stride,
-                src0, src1, src2, src3, src4, src5, src6, src7);
+  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   src += (8 * src_stride);
 
-  src8 = LOAD_SB(src);
+  src8 = LD_SB(src);
   src += src_stride;
 
-  ILVR_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
-                  src1, src2, src3, src4, src5, src6, src7, src8,
-                  src10_r, src21_r, src32_r, src43_r,
-                  src54_r, src65_r, src76_r, src87_r);
-
-  ILVR_D_4VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
-                  src6554, src65_r, src54_r, src8776, src87_r, src76_r);
-
-  src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
-  src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
-  src6554 = (v16i8)__msa_dotp_u_h((v16u8)src6554, (v16u8)filt0);
-  src8776 = (v16i8)__msa_dotp_u_h((v16u8)src8776, (v16u8)filt0);
-
-  src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
-  src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
-  src6554 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src6554, FILTER_BITS, 7);
-  src8776 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src8776, FILTER_BITS, 7);
-
-  src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
-  src4332 = (v16i8)__msa_pckev_b((v16i8)src8776, (v16i8)src6554);
-
-  out0 = __msa_copy_u_w((v4i32)src2110, 0);
-  out1 = __msa_copy_u_w((v4i32)src2110, 1);
-  out2 = __msa_copy_u_w((v4i32)src2110, 2);
-  out3 = __msa_copy_u_w((v4i32)src2110, 3);
-  out4 = __msa_copy_u_w((v4i32)src4332, 0);
-  out5 = __msa_copy_u_w((v4i32)src4332, 1);
-  out6 = __msa_copy_u_w((v4i32)src4332, 2);
-  out7 = __msa_copy_u_w((v4i32)src4332, 3);
-
-  STORE_WORD(dst, out0);
-  dst += dst_stride;
-  STORE_WORD(dst, out1);
-  dst += dst_stride;
-  STORE_WORD(dst, out2);
-  dst += dst_stride;
-  STORE_WORD(dst, out3);
-  dst += dst_stride;
-  STORE_WORD(dst, out4);
-  dst += dst_stride;
-  STORE_WORD(dst, out5);
-  dst += dst_stride;
-  STORE_WORD(dst, out6);
-  dst += dst_stride;
-  STORE_WORD(dst, out7);
+  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+             src32_r, src43_r);
+  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+             src76_r, src87_r);
+  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+             src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+              tmp0, tmp1, tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
 }
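The reason the narrow paths interleave before they multiply: ILVR_B puts each pixel immediately next to the pixel one row below it, and ILVR_D then stacks two such row-pairs into a single 16-byte register, so one DOTP_UB2_UH evaluates eight 2-tap outputs at once. A rough byte-layout model (helper name hypothetical):

    /* Model of ilvr.b: interleave the low 8 bytes of row a (row n) with
     * the low 8 bytes of row b (row n+1), giving {a0,b0,a1,b1,...,a7,b7}.
     * Dotting adjacent byte pairs against {f0,f1} yields one pixel each. */
    #include <stdint.h>

    static void ilvr_b_model(const uint8_t *a, const uint8_t *b,
                             uint8_t out[16]) {
      int i;
      for (i = 0; i < 8; ++i) {
        out[2 * i + 0] = a[i];
        out[2 * i + 1] = b[i];
      }
    }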
 
 static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
@@ -397,32 +368,24 @@
 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter) {
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 vec0, vec1, vec2, vec3, filt0;
+  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  v16i8 out0, out1;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_5VECS_UB(src, src_stride, src0, src1, src2, src3, src4);
-
-  ILVR_B_2VECS_UB(src0, src1, src1, src2, vec0, vec1);
-  ILVR_B_2VECS_UB(src2, src3, src3, src4, vec2, vec3);
-
-  /* filter calc */
-  tmp0 = __msa_dotp_u_h(vec0, filt0);
-  tmp1 = __msa_dotp_u_h(vec1, filt0);
-  tmp2 = __msa_dotp_u_h(vec2, filt0);
-  tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-  tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-  tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-  tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-  tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-  PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+              tmp2, tmp3);
+  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+  ST8x4_UB(out0, out1, dst, dst_stride);
 }
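PCKEV_B2_SB is the narrowing half of the pipeline: after SRARI/SAT every useful result sits in the low byte of a 16-bit lane, so picking the even-indexed bytes of two halfword vectors reassembles sixteen output pixels for ST8x4_UB. A scalar sketch (hypothetical helper, lanes shown as uint16_t):

    /* Model of pckev.b: the low half of the result takes the even bytes
     * of the second operand, the high half takes those of the first. */
    #include <stdint.h>

    static void pckev_b_model(const uint16_t ws[8], const uint16_t wt[8],
                              uint8_t out[16]) {
      int i;
      for (i = 0; i < 8; ++i) {
        out[i] = (uint8_t)(wt[i] & 0xff);
        out[8 + i] = (uint8_t)(ws[i] & 0xff);
      }
    }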
 
 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
@@ -431,51 +394,39 @@
   uint32_t loop_cnt;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16i8 out0, out1;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_UB(src);
+  src0 = LD_UB(src);
   src += src_stride;
 
   for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LOAD_8VECS_UB(src, src_stride,
-                  src1, src2, src3, src4, src5, src6, src7, src8);
+    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
     src += (8 * src_stride);
 
-    ILVR_B_4VECS_UB(src0, src1, src2, src3, src1, src2, src3, src4,
-                    vec0, vec1, vec2, vec3);
-
-    ILVR_B_4VECS_UB(src4, src5, src6, src7, src5, src6, src7, src8,
-                    vec4, vec5, vec6, vec7);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1,
+               vec2, vec3);
+    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5,
+               vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
 
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+                tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+    ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
 
     src0 = src8;
@@ -499,57 +450,45 @@
   v16u8 src0, src1, src2, src3, src4;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_UB(src);
+  src0 = LD_UB(src);
   src += src_stride;
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
     src += (4 * src_stride);
 
-    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
     dst += dst_stride;
 
-    ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
     dst += dst_stride;
 
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
     dst += dst_stride;
 
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst);
     dst += dst_stride;
 
     src0 = src4;
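The trailing `src0 = src4;` is the vertical sliding window: producing four output rows of a 2-tap filter consumes five input rows, so the newest row from one iteration carries over as the oldest row of the next and only four fresh rows are loaded per pass. A sketch of that structure for a 16-wide column, assuming bilinear taps that sum to 128 so no clamp is needed (all names hypothetical):

    /* Rolling-row skeleton of the 16-wide 2-tap vertical loop. */
    #include <stdint.h>

    static void vt_2t_rolling_model(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    uint8_t f0, uint8_t f1, int height) {
      const uint8_t *prev = src;          /* seeded once, like src0 above */
      int cnt, i, x;
      src += src_stride;
      for (cnt = height / 4; cnt--;) {    /* four output rows per pass */
        for (i = 0; i < 4; ++i) {
          for (x = 0; x < 16; ++x)
            dst[x] = (uint8_t)((prev[x] * f0 + src[x] * f1 + 64) >> 7);
          prev = src;                     /* the "src0 = src4" handoff */
          src += src_stride;
          dst += dst_stride;
        }
      }
    }
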
@@ -563,93 +502,68 @@
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  src0 = LOAD_UB(src);
-  src5 = LOAD_UB(src + 16);
+  src0 = LD_UB(src);
+  src5 = LD_UB(src + 16);
   src += src_stride;
 
   for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
 
-    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
-
-    LOAD_4VECS_UB(src + 16, src_stride, src6, src7, src8, src9);
+    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
     src += (4 * src_stride);
 
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
 
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
 
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
 
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
+    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16);
 
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
 
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
+    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
 
-    ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
-
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 2 * dst_stride);
-
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 3 * dst_stride);
-
-    ILV_B_LRLR_UB(src5, src6, src6, src7, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + dst_stride);
-
-    ILV_B_LRLR_UB(src7, src8, src8, src9, vec5, vec4, vec7, vec6);
-
-    tmp0 = __msa_dotp_u_h(vec4, filt0);
-    tmp1 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16 + 2 * dst_stride);
-
-    tmp2 = __msa_dotp_u_h(vec6, filt0);
-    tmp3 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + 3 * dst_stride);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
     dst += (4 * dst_stride);
 
     src0 = src4;
@@ -661,97 +575,72 @@
                                  uint8_t *dst, int32_t dst_stride,
                                  int8_t *filter, int32_t height) {
   uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 src8, src9, src10, src11;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
   v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8u16 filt;
+  v8i16 filt;
 
   /* rearranging filter_y */
-  filt = LOAD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+  filt = LD_SH(filter);
+  filt0 = (v16u8)__msa_splati_h(filt, 0);
 
-  LOAD_4VECS_UB(src, 16, src0, src3, src6, src9);
+  LD_UB4(src, 16, src0, src3, src6, src9);
   src += src_stride;
 
   for (loop_cnt = (height >> 1); loop_cnt--;) {
-    LOAD_2VECS_UB(src, src_stride, src1, src2);
-    LOAD_2VECS_UB(src + 16, src_stride, src4, src5);
-    LOAD_2VECS_UB(src + 32, src_stride, src7, src8);
-    LOAD_2VECS_UB(src + 48, src_stride, src10, src11);
+    LD_UB2(src, src_stride, src1, src2);
+    LD_UB2(src + 16, src_stride, src4, src5);
+    LD_UB2(src + 32, src_stride, src7, src8);
+    LD_UB2(src + 48, src_stride, src10, src11);
     src += (2 * src_stride);
 
-    ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst);
 
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
 
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    SAT_UH2_UH(tmp4, tmp5, 7);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 16);
 
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    SAT_UH2_UH(tmp6, tmp7, 7);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
 
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
+    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    PCKEV_ST_SB(tmp0, tmp1, dst + 32);
 
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
 
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
+    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+    SAT_UH2_UH(tmp4, tmp5, 7);
+    PCKEV_ST_SB(tmp4, tmp5, dst + 48);
 
-    ILV_B_LRLR_UB(src3, src4, src4, src5, vec5, vec4, vec7, vec6);
-
-    tmp4 = __msa_dotp_u_h(vec4, filt0);
-    tmp5 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
-    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 16);
-
-    tmp6 = __msa_dotp_u_h(vec6, filt0);
-    tmp7 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
-    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 16 + dst_stride);
-
-    ILV_B_LRLR_UB(src6, src7, src7, src8, vec1, vec0, vec3, vec2);
-
-    tmp0 = __msa_dotp_u_h(vec0, filt0);
-    tmp1 = __msa_dotp_u_h(vec1, filt0);
-
-    tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
-    tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 32);
-
-    tmp2 = __msa_dotp_u_h(vec2, filt0);
-    tmp3 = __msa_dotp_u_h(vec3, filt0);
-
-    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
-    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 32 + dst_stride);
-
-    ILV_B_LRLR_UB(src9, src10, src10, src11, vec5, vec4, vec7, vec6);
-
-    tmp4 = __msa_dotp_u_h(vec4, filt0);
-    tmp5 = __msa_dotp_u_h(vec5, filt0);
-
-    tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
-    tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 48);
-
-    tmp6 = __msa_dotp_u_h(vec6, filt0);
-    tmp7 = __msa_dotp_u_h(vec7, filt0);
-
-    tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
-    tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
-
-    PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 48 + dst_stride);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+    SAT_UH2_UH(tmp6, tmp7, 7);
+    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
     dst += (2 * dst_stride);
 
     src0 = src2;
--- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_avg_msa.c
@@ -19,46 +19,35 @@
 
   if (0 == (height % 4)) {
     for (cnt = (height / 4); cnt--;) {
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
       src += (4 * src_stride);
 
-      LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3);
+      LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
-      dst0 = __msa_aver_u_b(src0, dst0);
-      dst1 = __msa_aver_u_b(src1, dst1);
-      dst2 = __msa_aver_u_b(src2, dst2);
-      dst3 = __msa_aver_u_b(src3, dst3);
+      AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                  dst0, dst1, dst2, dst3);
 
       out0 = __msa_copy_u_w((v4i32)dst0, 0);
       out1 = __msa_copy_u_w((v4i32)dst1, 0);
       out2 = __msa_copy_u_w((v4i32)dst2, 0);
       out3 = __msa_copy_u_w((v4i32)dst3, 0);
-
-      STORE_WORD(dst, out0);
-      dst += dst_stride;
-      STORE_WORD(dst, out1);
-      dst += dst_stride;
-      STORE_WORD(dst, out2);
-      dst += dst_stride;
-      STORE_WORD(dst, out3);
-      dst += dst_stride;
+      SW4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
     }
   } else if (0 == (height % 2)) {
     for (cnt = (height / 2); cnt--;) {
-      LOAD_2VECS_UB(src, src_stride, src0, src1);
+      LD_UB2(src, src_stride, src0, src1);
       src += (2 * src_stride);
 
-      LOAD_2VECS_UB(dst, dst_stride, dst0, dst1);
+      LD_UB2(dst, dst_stride, dst0, dst1);
 
-      dst0 = __msa_aver_u_b(src0, dst0);
-      dst1 = __msa_aver_u_b(src1, dst1);
+      AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
 
       out0 = __msa_copy_u_w((v4i32)dst0, 0);
       out1 = __msa_copy_u_w((v4i32)dst1, 0);
-
-      STORE_WORD(dst, out0);
+      SW(out0, dst);
       dst += dst_stride;
-      STORE_WORD(dst, out1);
+      SW(out1, dst);
       dst += dst_stride;
     }
   }
@@ -72,29 +61,19 @@
   v16u8 dst0, dst1, dst2, dst3;
 
   for (cnt = (height / 4); cnt--;) {
-    LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 
-    LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
 
-    dst0 = __msa_aver_u_b(src0, dst0);
-    dst1 = __msa_aver_u_b(src1, dst1);
-    dst2 = __msa_aver_u_b(src2, dst2);
-    dst3 = __msa_aver_u_b(src3, dst3);
-
     out0 = __msa_copy_u_d((v2i64)dst0, 0);
     out1 = __msa_copy_u_d((v2i64)dst1, 0);
     out2 = __msa_copy_u_d((v2i64)dst2, 0);
     out3 = __msa_copy_u_d((v2i64)dst3, 0);
-
-    STORE_DWORD(dst, out0);
-    dst += dst_stride;
-    STORE_DWORD(dst, out1);
-    dst += dst_stride;
-    STORE_DWORD(dst, out2);
-    dst += dst_stride;
-    STORE_DWORD(dst, out3);
-    dst += dst_stride;
+    SD4(out0, out1, out2, out3, dst, dst_stride);
+    dst += (4 * dst_stride);
   }
 }
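Every convolve_avg width variant bottoms out in the same primitive: __msa_aver_u_b, the rounding byte average (a + b + 1) >> 1, applied between the incoming source row and the pixels already in dst. Scalar equivalent of one row (a sketch, name hypothetical):

    /* Scalar model of AVER_UB*: rounding average of src into dst. */
    #include <stdint.h>

    static void avg_row_model(const uint8_t *src, uint8_t *dst, int w) {
      int x;
      for (x = 0; x < w; ++x)
        dst[x] = (uint8_t)((src[x] + dst[x] + 1) >> 1);
    }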
 
@@ -105,24 +84,15 @@
   v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
   for (cnt = (height / 8); cnt--;) {
-    LOAD_8VECS_UB(src, src_stride,
-                  src0, src1, src2, src3, src4, src5, src6, src7);
+    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
     src += (8 * src_stride);
+    LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
-    LOAD_8VECS_UB(dst, dst_stride,
-                  dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
-
-    dst0 = __msa_aver_u_b(src0, dst0);
-    dst1 = __msa_aver_u_b(src1, dst1);
-    dst2 = __msa_aver_u_b(src2, dst2);
-    dst3 = __msa_aver_u_b(src3, dst3);
-    dst4 = __msa_aver_u_b(src4, dst4);
-    dst5 = __msa_aver_u_b(src5, dst5);
-    dst6 = __msa_aver_u_b(src6, dst6);
-    dst7 = __msa_aver_u_b(src7, dst7);
-
-    STORE_8VECS_UB(dst, dst_stride,
-                   dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                dst4, dst5, dst6, dst7);
+    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
     dst += (8 * dst_stride);
   }
 }
@@ -137,99 +107,34 @@
   v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
   for (cnt = (height / 8); cnt--;) {
-    src0 = LOAD_UB(src);
-    src1 = LOAD_UB(src + 16);
-    src += src_stride;
-    src2 = LOAD_UB(src);
-    src3 = LOAD_UB(src + 16);
-    src += src_stride;
-    src4 = LOAD_UB(src);
-    src5 = LOAD_UB(src + 16);
-    src += src_stride;
-    src6 = LOAD_UB(src);
-    src7 = LOAD_UB(src + 16);
-    src += src_stride;
+    LD_UB4(src, src_stride, src0, src2, src4, src6);
+    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+    src += (4 * src_stride);
+    LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
+    LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
+    dst_dup += (4 * dst_stride);
+    LD_UB4(src, src_stride, src8, src10, src12, src14);
+    LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
+    src += (4 * src_stride);
+    LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
+    LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
+    dst_dup += (4 * dst_stride);
 
-    dst0 = LOAD_UB(dst_dup);
-    dst1 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-    dst2 = LOAD_UB(dst_dup);
-    dst3 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-    dst4 = LOAD_UB(dst_dup);
-    dst5 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-    dst6 = LOAD_UB(dst_dup);
-    dst7 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                dst4, dst5, dst6, dst7);
+    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                dst8, dst9, dst10, dst11);
+    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                dst12, dst13, dst14, dst15);
 
-    src8 = LOAD_UB(src);
-    src9 = LOAD_UB(src + 16);
-    src += src_stride;
-    src10 = LOAD_UB(src);
-    src11 = LOAD_UB(src + 16);
-    src += src_stride;
-    src12 = LOAD_UB(src);
-    src13 = LOAD_UB(src + 16);
-    src += src_stride;
-    src14 = LOAD_UB(src);
-    src15 = LOAD_UB(src + 16);
-    src += src_stride;
-
-    dst8 = LOAD_UB(dst_dup);
-    dst9 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-    dst10 = LOAD_UB(dst_dup);
-    dst11 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-    dst12 = LOAD_UB(dst_dup);
-    dst13 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-    dst14 = LOAD_UB(dst_dup);
-    dst15 = LOAD_UB(dst_dup + 16);
-    dst_dup += dst_stride;
-
-    dst0 = __msa_aver_u_b(src0, dst0);
-    dst1 = __msa_aver_u_b(src1, dst1);
-    dst2 = __msa_aver_u_b(src2, dst2);
-    dst3 = __msa_aver_u_b(src3, dst3);
-    dst4 = __msa_aver_u_b(src4, dst4);
-    dst5 = __msa_aver_u_b(src5, dst5);
-    dst6 = __msa_aver_u_b(src6, dst6);
-    dst7 = __msa_aver_u_b(src7, dst7);
-    dst8 = __msa_aver_u_b(src8, dst8);
-    dst9 = __msa_aver_u_b(src9, dst9);
-    dst10 = __msa_aver_u_b(src10, dst10);
-    dst11 = __msa_aver_u_b(src11, dst11);
-    dst12 = __msa_aver_u_b(src12, dst12);
-    dst13 = __msa_aver_u_b(src13, dst13);
-    dst14 = __msa_aver_u_b(src14, dst14);
-    dst15 = __msa_aver_u_b(src15, dst15);
-
-    STORE_UB(dst0, dst);
-    STORE_UB(dst1, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst2, dst);
-    STORE_UB(dst3, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst4, dst);
-    STORE_UB(dst5, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst6, dst);
-    STORE_UB(dst7, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst8, dst);
-    STORE_UB(dst9, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst10, dst);
-    STORE_UB(dst11, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst12, dst);
-    STORE_UB(dst13, dst + 16);
-    dst += dst_stride;
-    STORE_UB(dst14, dst);
-    STORE_UB(dst15, dst + 16);
-    dst += dst_stride;
+    ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
+    ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
+    ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
+    ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
+    dst += (4 * dst_stride);
   }
 }
 
@@ -243,48 +148,40 @@
   v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
 
   for (cnt = (height / 4); cnt--;) {
-    LOAD_4VECS_UB(src, 16, src0, src1, src2, src3);
+    LD_UB4(src, 16, src0, src1, src2, src3);
     src += src_stride;
-    LOAD_4VECS_UB(src, 16, src4, src5, src6, src7);
+    LD_UB4(src, 16, src4, src5, src6, src7);
     src += src_stride;
-    LOAD_4VECS_UB(src, 16, src8, src9, src10, src11);
+    LD_UB4(src, 16, src8, src9, src10, src11);
     src += src_stride;
-    LOAD_4VECS_UB(src, 16, src12, src13, src14, src15);
+    LD_UB4(src, 16, src12, src13, src14, src15);
     src += src_stride;
 
-    LOAD_4VECS_UB(dst_dup, 16, dst0, dst1, dst2, dst3);
+    LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
     dst_dup += dst_stride;
-    LOAD_4VECS_UB(dst_dup, 16, dst4, dst5, dst6, dst7);
+    LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
     dst_dup += dst_stride;
-    LOAD_4VECS_UB(dst_dup, 16, dst8, dst9, dst10, dst11);
+    LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
     dst_dup += dst_stride;
-    LOAD_4VECS_UB(dst_dup, 16, dst12, dst13, dst14, dst15);
+    LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
     dst_dup += dst_stride;
 
-    dst0 = __msa_aver_u_b(src0, dst0);
-    dst1 = __msa_aver_u_b(src1, dst1);
-    dst2 = __msa_aver_u_b(src2, dst2);
-    dst3 = __msa_aver_u_b(src3, dst3);
-    dst4 = __msa_aver_u_b(src4, dst4);
-    dst5 = __msa_aver_u_b(src5, dst5);
-    dst6 = __msa_aver_u_b(src6, dst6);
-    dst7 = __msa_aver_u_b(src7, dst7);
-    dst8 = __msa_aver_u_b(src8, dst8);
-    dst9 = __msa_aver_u_b(src9, dst9);
-    dst10 = __msa_aver_u_b(src10, dst10);
-    dst11 = __msa_aver_u_b(src11, dst11);
-    dst12 = __msa_aver_u_b(src12, dst12);
-    dst13 = __msa_aver_u_b(src13, dst13);
-    dst14 = __msa_aver_u_b(src14, dst14);
-    dst15 = __msa_aver_u_b(src15, dst15);
+    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+                dst0, dst1, dst2, dst3);
+    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+                dst4, dst5, dst6, dst7);
+    AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
+                dst8, dst9, dst10, dst11);
+    AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
+                dst12, dst13, dst14, dst15);
 
-    STORE_4VECS_UB(dst, 16, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
     dst += dst_stride;
-    STORE_4VECS_UB(dst, 16, dst4, dst5, dst6, dst7);
+    ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
     dst += dst_stride;
-    STORE_4VECS_UB(dst, 16, dst8, dst9, dst10, dst11);
+    ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
     dst += dst_stride;
-    STORE_4VECS_UB(dst, 16, dst12, dst13, dst14, dst15);
+    ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
     dst += dst_stride;
   }
 }
--- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_copy_msa.c
@@ -12,8 +12,7 @@
 #include "vp9/common/mips/msa/vp9_macros_msa.h"
 
 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride,
-                            int32_t height) {
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
   int32_t cnt;
   uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
@@ -20,8 +19,7 @@
 
   if (0 == height % 12) {
     for (cnt = (height / 12); cnt--;) {
-      LOAD_8VECS_UB(src, src_stride,
-                    src0, src1, src2, src3, src4, src5, src6, src7);
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
       src += (8 * src_stride);
 
       out0 = __msa_copy_u_d((v2i64)src0, 0);
@@ -33,24 +31,12 @@
       out6 = __msa_copy_u_d((v2i64)src6, 0);
       out7 = __msa_copy_u_d((v2i64)src7, 0);
 
-      STORE_DWORD(dst, out0);
-      dst += dst_stride;
-      STORE_DWORD(dst, out1);
-      dst += dst_stride;
-      STORE_DWORD(dst, out2);
-      dst += dst_stride;
-      STORE_DWORD(dst, out3);
-      dst += dst_stride;
-      STORE_DWORD(dst, out4);
-      dst += dst_stride;
-      STORE_DWORD(dst, out5);
-      dst += dst_stride;
-      STORE_DWORD(dst, out6);
-      dst += dst_stride;
-      STORE_DWORD(dst, out7);
-      dst += dst_stride;
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(out4, out5, out6, out7, dst, dst_stride);
+      dst += (4 * dst_stride);
 
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
       src += (4 * src_stride);
 
       out0 = __msa_copy_u_d((v2i64)src0, 0);
@@ -57,20 +43,12 @@
       out1 = __msa_copy_u_d((v2i64)src1, 0);
       out2 = __msa_copy_u_d((v2i64)src2, 0);
       out3 = __msa_copy_u_d((v2i64)src3, 0);
-
-      STORE_DWORD(dst, out0);
-      dst += dst_stride;
-      STORE_DWORD(dst, out1);
-      dst += dst_stride;
-      STORE_DWORD(dst, out2);
-      dst += dst_stride;
-      STORE_DWORD(dst, out3);
-      dst += dst_stride;
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
     }
   } else if (0 == height % 8) {
     for (cnt = height >> 3; cnt--;) {
-      LOAD_8VECS_UB(src, src_stride,
-                    src0, src1, src2, src3, src4, src5, src6, src7);
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
       src += (8 * src_stride);
 
       out0 = __msa_copy_u_d((v2i64)src0, 0);
@@ -82,53 +60,33 @@
       out6 = __msa_copy_u_d((v2i64)src6, 0);
       out7 = __msa_copy_u_d((v2i64)src7, 0);
 
-      STORE_DWORD(dst, out0);
-      dst += dst_stride;
-      STORE_DWORD(dst, out1);
-      dst += dst_stride;
-      STORE_DWORD(dst, out2);
-      dst += dst_stride;
-      STORE_DWORD(dst, out3);
-      dst += dst_stride;
-      STORE_DWORD(dst, out4);
-      dst += dst_stride;
-      STORE_DWORD(dst, out5);
-      dst += dst_stride;
-      STORE_DWORD(dst, out6);
-      dst += dst_stride;
-      STORE_DWORD(dst, out7);
-      dst += dst_stride;
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
+      SD4(out4, out5, out6, out7, dst, dst_stride);
+      dst += (4 * dst_stride);
     }
   } else if (0 == height % 4) {
     for (cnt = (height / 4); cnt--;) {
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
       src += (4 * src_stride);
-
       out0 = __msa_copy_u_d((v2i64)src0, 0);
       out1 = __msa_copy_u_d((v2i64)src1, 0);
       out2 = __msa_copy_u_d((v2i64)src2, 0);
       out3 = __msa_copy_u_d((v2i64)src3, 0);
 
-      STORE_DWORD(dst, out0);
-      dst += dst_stride;
-      STORE_DWORD(dst, out1);
-      dst += dst_stride;
-      STORE_DWORD(dst, out2);
-      dst += dst_stride;
-      STORE_DWORD(dst, out3);
-      dst += dst_stride;
+      SD4(out0, out1, out2, out3, dst, dst_stride);
+      dst += (4 * dst_stride);
     }
   } else if (0 == height % 2) {
     for (cnt = (height / 2); cnt--;) {
-      LOAD_2VECS_UB(src, src_stride, src0, src1);
+      LD_UB2(src, src_stride, src0, src1);
       src += (2 * src_stride);
-
       out0 = __msa_copy_u_d((v2i64)src0, 0);
       out1 = __msa_copy_u_d((v2i64)src1, 0);
 
-      STORE_DWORD(dst, out0);
+      SD(out0, dst);
       dst += dst_stride;
-      STORE_DWORD(dst, out1);
+      SD(out1, dst);
       dst += dst_stride;
     }
   }
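copy_width8 never touches pixel values; each row is pulled into a vector, the low 64 bits are extracted with __msa_copy_u_d, and SD/SD4 store them back as doublewords, with SD4 folding four store-and-advance pairs into one call. A plain-C picture of the 8-wide copy (a sketch of the data movement, not the height-dispatch logic):

    /* What the copy_u_d + SD stores amount to for an 8-pixel-wide block. */
    #include <stdint.h>
    #include <string.h>

    static void copy_w8_model(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride, int height) {
      int y;
      for (y = 0; y < height; ++y) {
        uint64_t row;
        memcpy(&row, src, sizeof(row));   /* LD_UB + __msa_copy_u_d */
        memcpy(dst, &row, sizeof(row));   /* SD */
        src += src_stride;
        dst += dst_stride;
      }
    }
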
@@ -147,12 +105,12 @@
     dst_tmp = dst;
 
     for (loop_cnt = (height >> 3); loop_cnt--;) {
-      LOAD_8VECS_UB(src_tmp, src_stride,
-                    src0, src1, src2, src3, src4, src5, src6, src7);
+      LD_UB8(src_tmp, src_stride,
+             src0, src1, src2, src3, src4, src5, src6, src7);
       src_tmp += (8 * src_stride);
 
-      STORE_8VECS_UB(dst_tmp, dst_stride,
-                     src0, src1, src2, src3, src4, src5, src6, src7);
+      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
+             dst_tmp, dst_stride);
       dst_tmp += (8 * dst_stride);
     }
 
@@ -162,25 +120,20 @@
 }
 
 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
-                             uint8_t *dst, int32_t dst_stride,
-                             int32_t height) {
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
   int32_t cnt;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
   if (0 == height % 12) {
     for (cnt = (height / 12); cnt--;) {
-      LOAD_8VECS_UB(src, src_stride,
-                    src0, src1, src2, src3, src4, src5, src6, src7);
+      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
       src += (8 * src_stride);
-
-      STORE_8VECS_UB(dst, dst_stride,
-                     src0, src1, src2, src3, src4, src5, src6, src7);
+      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
       dst += (8 * dst_stride);
 
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
       src += (4 * src_stride);
-
-      STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
       dst += (4 * dst_stride);
     }
   } else if (0 == height % 8) {
@@ -187,10 +140,10 @@
     copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
   } else if (0 == height % 4) {
     for (cnt = (height >> 2); cnt--;) {
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
       src += (4 * src_stride);
 
-      STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
       dst += (4 * dst_stride);
     }
   }
@@ -197,35 +150,31 @@
 }
 
 static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
-                             uint8_t *dst, int32_t dst_stride,
-                             int32_t height) {
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
   int32_t cnt;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
 
   if (0 == height % 12) {
     for (cnt = (height / 12); cnt--;) {
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
-      LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
       src += (4 * src_stride);
-
-      STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
-      STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
       dst += (4 * dst_stride);
 
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
-      LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
       src += (4 * src_stride);
-
-      STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
-      STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
       dst += (4 * dst_stride);
 
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
-      LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
       src += (4 * src_stride);
-
-      STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
-      STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
       dst += (4 * dst_stride);
     }
   } else if (0 == height % 8) {
@@ -232,12 +181,11 @@
     copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
   } else if (0 == height % 4) {
     for (cnt = (height >> 2); cnt--;) {
-      LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
-      LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+      LD_UB4(src, src_stride, src0, src1, src2, src3);
+      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
       src += (4 * src_stride);
-
-      STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
-      STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
       dst += (4 * dst_stride);
     }
   }
@@ -244,8 +192,7 @@
 }
 
 static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
-                             uint8_t *dst, int32_t dst_stride,
-                             int32_t height) {
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
   copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
 }
 
@@ -264,8 +211,8 @@
       uint32_t cnt, tmp;
       /* 1 word storage */
       for (cnt = h; cnt--;) {
-        tmp = LOAD_WORD(src);
-        STORE_WORD(dst, tmp);
+        tmp = LW(src);
+        SW(tmp, dst);
         src += src_stride;
         dst += dst_stride;
       }
--- a/vp9/common/mips/msa/vp9_convolve_msa.h
+++ b/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -16,142 +16,104 @@
 
 extern const uint8_t mc_filt_mask_arr[16 * 3];
 
-#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3,                   \
-                        filt_h0, filt_h1, filt_h2, filt_h3) ({             \
-  v8i16 vec0, vec1, vec2, vec3, horiz_out;                                 \
-                                                                           \
-  vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src));  \
-  vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0));                    \
-  vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src));  \
-  vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1);             \
-  vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src));  \
-  vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2));                    \
-  vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src));  \
-  vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3);             \
-  vec0 = __msa_adds_s_h(vec0, vec2);                                       \
-  horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7);               \
-                                                                           \
-  horiz_out;                                                               \
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,         \
+                            filt0, filt1, filt2, filt3) ({  \
+  v8i16 tmp0, tmp1;                                         \
+                                                            \
+  tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);         \
+  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1);  \
+  tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2);         \
+  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3);  \
+  tmp0 = __msa_adds_s_h(tmp0, tmp1);                        \
+                                                            \
+  tmp0;                                                     \
 })
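FILT_8TAP_DPADD_S_H is the signed core shared by every 8-tap path: two dotp/dpadd pairs fold the eight taps into two 16-bit partial sums, which a saturating add then joins. Per output pixel the arithmetic looks like this (a scalar sketch; VP9's tap magnitudes keep each partial within 16 bits):

    /* Scalar model of the 8-tap pairwise accumulate above. */
    #include <stdint.h>

    static int16_t adds_s16(int32_t a) {          /* adds_s.h behaviour */
      return (int16_t)(a > 32767 ? 32767 : (a < -32768 ? -32768 : a));
    }

    static int16_t filt_8tap_model(const int16_t s[8], const int16_t f[8]) {
      int16_t tmp0 = (int16_t)(s[0] * f[0] + s[1] * f[1]    /* dotp_s.h  */
                             + s[2] * f[2] + s[3] * f[3]);  /* dpadd_s.h */
      int16_t tmp1 = (int16_t)(s[4] * f[4] + s[5] * f[5]
                             + s[6] * f[6] + s[7] * f[7]);
      return adds_s16((int32_t)tmp0 + tmp1);
    }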
 
-#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3,        \
-                              filt_h0, filt_h1, filt_h2, filt_h3) ({         \
-  v8i16 vec0, vec1, vec2, vec3, horiz_out;                                   \
-                                                                             \
-  vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0));  \
-  vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0));                      \
-  vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0));  \
-  vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1);               \
-  vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0));  \
-  vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2));                      \
-  vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0));  \
-  vec2 = __msa_dpadd_s_h(vec2, ((v16i8)filt_h3), (v16i8)vec3);               \
-  vec0 = __msa_adds_s_h(vec0, vec2);                                         \
-  horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7);          \
-                                                                             \
-  horiz_out;                                                                 \
+#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,        \
+                        filt_h0, filt_h1, filt_h2, filt_h3) ({         \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
+  v8i16 hz_out_m;                                                      \
+                                                                       \
+  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
+             vec0_m, vec1_m, vec2_m, vec3_m);                          \
+  hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
+                                 filt_h0, filt_h1, filt_h2, filt_h3);  \
+                                                                       \
+  hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS);                     \
+  hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
+                                                                       \
+  hz_out_m;                                                            \
 })
 
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
-                            filt0, filt1, filt2, filt3) ({      \
-  v8i16 tmp0, tmp1;                                             \
-                                                                \
-  tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0));         \
-  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1));  \
-  tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2));         \
-  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), ((v16i8)filt3));  \
-  tmp0 = __msa_adds_s_h(tmp0, tmp1);                            \
-                                                                \
-  tmp0;                                                         \
-})
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,               \
+                                   mask0, mask1, mask2, mask3,           \
+                                   filt0, filt1, filt2, filt3,           \
+                                   out0, out1) {                         \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
+  v8i16 res0_m, res1_m, res2_m, res3_m;                                  \
+                                                                         \
+  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
+  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);             \
+  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
+  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);            \
+  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
+  DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);             \
+  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
+  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);            \
+  ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);               \
+}
 
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                     \
-                                   mask0, mask1, mask2, mask3,                 \
-                                   filt0, filt1, filt2, filt3,                 \
-                                   out0, out1) {                               \
-  v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;        \
-  v8i16 res0_m, res1_m, res2_m, res3_m;                                        \
-                                                                               \
-  vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0));  \
-  vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2));  \
-                                                                               \
-  res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0));                      \
-  res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0));                      \
-                                                                               \
-  vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0));  \
-  vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2));  \
-                                                                               \
-  res0_m = __msa_dpadd_s_h(res0_m, (filt1), (v16i8)vec2_m);                    \
-  res1_m = __msa_dpadd_s_h(res1_m, (filt1), (v16i8)vec3_m);                    \
-                                                                               \
-  vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0));  \
-  vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2));  \
-                                                                               \
-  res2_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec4_m);                      \
-  res3_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec5_m);                      \
-                                                                               \
-  vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0));  \
-  vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2));  \
-                                                                               \
-  res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m);             \
-  res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m);             \
-                                                                               \
-  out0 = __msa_adds_s_h(res0_m, res2_m);                                       \
-  out1 = __msa_adds_s_h(res1_m, res3_m);                                       \
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
+                                   mask0, mask1, mask2, mask3,              \
+                                   filt0, filt1, filt2, filt3,              \
+                                   out0, out1, out2, out3) {                \
+  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
+  v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;     \
+                                                                            \
+  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
+  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
+              res0_m, res1_m, res2_m, res3_m);                              \
+  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);         \
+  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,   \
+              res4_m, res5_m, res6_m, res7_m);                              \
+  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);         \
+  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,  \
+               res0_m, res1_m, res2_m, res3_m);                             \
+  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);         \
+  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);         \
+  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,  \
+               res4_m, res5_m, res6_m, res7_m);                             \
+  ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,       \
+              res7_m, out0, out1, out2, out3);                              \
 }
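Both horizontal helpers lean on vshf.b with the masks in mc_filt_mask_arr: each mask gathers an overlapping window of source bytes per output pixel from a row that is loaded only once, which is what lets the filter slide without re-reading memory. A scalar picture of one gather (indices illustrative; the real table also uses the two-source form of vshf.b, as in HORIZ_8TAP_FILT above):

    /* Model of one VSHF_B application: mask[i] names a source byte. */
    #include <stdint.h>

    static void vshf_b_model(const uint8_t mask[16], const uint8_t src[16],
                             uint8_t out[16]) {
      int i;
      for (i = 0; i < 16; ++i)
        out[i] = src[mask[i] & 15];   /* single-vector shuffle case */
    }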
 
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                     \
-                                   mask0, mask1, mask2, mask3,                 \
-                                   filt0, filt1, filt2, filt3,                 \
-                                   out0, out1, out2, out3) {                   \
-  v8i16 vec0_m, vec1_m, vec2_m, vec3_m;                                        \
-  v8i16 vec4_m, vec5_m, vec6_m, vec7_m;                                        \
-  v8i16 res0_m, res1_m, res2_m, res3_m;                                        \
-  v8i16 res4_m, res5_m, res6_m, res7_m;                                        \
-                                                                               \
-  vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0));  \
-  vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1));  \
-  vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2));  \
-  vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3));  \
-                                                                               \
-  res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0));                      \
-  res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0));                      \
-  res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0));                      \
-  res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0));                      \
-                                                                               \
-  vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0));  \
-  vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1));  \
-  vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2));  \
-  vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3));  \
-                                                                               \
-  res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2));                      \
-  res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2));                      \
-  res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2));                      \
-  res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2));                      \
-                                                                               \
-  vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0));  \
-  vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1));  \
-  vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2));  \
-  vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3));  \
-                                                                               \
-  res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m);             \
-  res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m);             \
-  res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m);             \
-  res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m);             \
-                                                                               \
-  vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0));  \
-  vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1));  \
-  vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2));  \
-  vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3));  \
-                                                                               \
-  res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m);             \
-  res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m);             \
-  res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m);             \
-  res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m);             \
-                                                                               \
-  out0 = __msa_adds_s_h(res0_m, res4_m);                                       \
-  out1 = __msa_adds_s_h(res1_m, res5_m);                                       \
-  out2 = __msa_adds_s_h(res2_m, res6_m);                                       \
-  out3 = __msa_adds_s_h(res3_m, res7_m);                                       \
+#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) {  \
+  v16u8 tmp_m;                                          \
+                                                        \
+  tmp_m = PCKEV_XORI128_UB(in1, in0);                   \
+  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);            \
+  ST_UB(tmp_m, (pdst));                                 \
+}
+
+#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) {           \
+  v16u8 tmp_m;                                           \
+                                                         \
+  tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
+  tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst);             \
+  ST_UB(tmp_m, (pdst));                                  \
+}
+
+#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
+                           pdst, stride) {                              \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);                                  \
+                                                                        \
+  PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                      \
+  PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                  \
+  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);          \
+  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                             \
 }
 #endif  /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -286,19 +286,19 @@
 # Sub Pixel Filters
 #
 add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc";
+specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
 
 add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2/, "$avx2_ssse3";
+specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
 
 add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2/, "$avx2_ssse3";
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
 
 add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2/, "$avx2_ssse3";
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
 
 add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -132,6 +132,12 @@
 
 # common (msa)
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c