shithub: libvpx

ref: 814e1346a61c9c5a1fe2bd2ccbfd025a398ed636
parent: cc4c5de22f76e53b1e816f21fcdb910934855c00
parent: 1579bb88c5f2015e3d947d2d067824426f945715
author: Parag Salasakar <[email protected]>
date: Tue Aug 4 00:30:22 EDT 2015

Merge "mips msa vpx convolve optimzation"

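The patch drops the clamp-to-255 step (MIN_UH*_UH against a const255 vector, and the equivalent SAT_UH*_UH(..., 7)) that followed each SRARI_H* rounding shift in the 2-tap MSA convolve paths. Below is a minimal scalar sketch, not part of the patch, of why that clamp is redundant; it assumes the bilinear coefficients are non-negative and sum to 1 << FILTER_BITS (128), as in the vpx bilinear filter table.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FILTER_BITS 7

	/* Scalar model of one DOTP_UB + SRARI_H output lane. */
	static uint16_t bilin_pixel(uint8_t a, uint8_t b, uint8_t f0, uint8_t f1) {
	  /* f0 + f1 == 1 << FILTER_BITS, so acc <= 255 * 128 = 32640. */
	  uint32_t acc = (uint32_t)a * f0 + (uint32_t)b * f1;            /* DOTP_UB  */
	  uint16_t out = (acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS; /* SRARI_H */
	  assert(out <= 255); /* already fits in 8 bits; MIN/SAT to 255 is a no-op */
	  return out;
	}

	int main(void) {
	  /* Worst case: both taps at 255 with the coefficients split 112/16. */
	  printf("%u\n", bilin_pixel(255, 255, 112, 16)); /* prints 255 */
	  return 0;
	}

Since the rounded, shifted result can never exceed 255 before PCKEV_B*/PCKEV_AVG_ST* packs it back to bytes, removing the clamp saves one vector op per group of results without changing the output.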
--- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c
@@ -323,7 +323,7 @@
                                               int8_t *filter) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, const255, filt;
+  v8u16 vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
 
@@ -331,14 +331,11 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
   SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  MIN_UH2_UH(vec2, vec3, const255);
   PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
   ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
   AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
@@ -353,7 +350,7 @@
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
   v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
-  v8u16 vec4, vec5, vec6, vec7, const255, filt;
+  v8u16 vec4, vec5, vec6, vec7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
 
@@ -361,8 +358,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
@@ -370,7 +365,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
               vec6, vec7);
   SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
   PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
               res3);
   ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
@@ -402,7 +396,7 @@
                                               int8_t *filter) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -410,8 +404,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@@ -419,7 +411,6 @@
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                      dst, dst_stride);
 }
@@ -432,7 +423,7 @@
                                                   int32_t height) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
-  v8u16 vec0, vec1, vec2, vec3, const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -440,8 +431,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
@@ -450,7 +439,6 @@
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
   PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
@@ -463,7 +451,6 @@
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                      dst, dst_stride);
   dst += (4 * dst_stride);
@@ -478,7 +465,6 @@
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                        dst, dst_stride);
@@ -490,7 +476,6 @@
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                        dst, dst_stride);
   }
@@ -520,7 +505,7 @@
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -528,8 +513,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src2, src4, src6);
   LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
   src += (4 * src_stride);
@@ -545,8 +528,6 @@
   SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
   SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-  MIN_UH4_UH(res0, res1, res2, res3, const255);
-  MIN_UH4_UH(res4, res5, res6, res7, const255);
   PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
   dst += dst_stride;
   PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
@@ -572,8 +553,6 @@
     SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
     SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(res0, res1, res2, res3, const255);
-    MIN_UH4_UH(res4, res5, res6, res7, const255);
     PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
     dst += dst_stride;
     PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
@@ -595,7 +574,7 @@
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, const255, filt;
+  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -603,8 +582,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   for (loop_cnt = (height >> 1); loop_cnt--;) {
     src0 = LD_SB(src);
     src2 = LD_SB(src + 16);
@@ -627,8 +604,6 @@
                 res6, res7);
     SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS);
     SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS);
-    MIN_UH4_UH(res0, res1, res2, res3, const255);
-    MIN_UH4_UH(res4, res5, res6, res7, const255);
     LD_UB2(dst, 16, dst0, dst1);
     PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
     PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
@@ -650,7 +625,7 @@
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, dst0, dst1, dst2, dst3;
   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, const255, filt;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -658,8 +633,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
 
-  const255 = (v8u16)__msa_ldi_h(255);
-
   for (loop_cnt = height; loop_cnt--;) {
     LD_SB4(src, 16, src0, src2, src4, src6);
     src7 = LD_SB(src + 56);
@@ -677,8 +650,6 @@
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
     LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
     PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
     PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
--- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c
@@ -274,7 +274,6 @@
   ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
   DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
   SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  SAT_UH2_UH(tmp0, tmp1, 7);
   PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
   AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
@@ -323,7 +322,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
               tmp0, tmp1, tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
               res2, res3);
   AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
@@ -391,7 +389,6 @@
   tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                      dst, dst_stride);
 }
@@ -436,7 +433,6 @@
     tmp1 = __msa_dotp_u_h(vec0, filt_vt);
 
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
 
     hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
@@ -447,7 +443,6 @@
     tmp3 = __msa_dotp_u_h(vec0, filt_vt);
 
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
     PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                        dst, dst_stride);
@@ -511,7 +506,6 @@
     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
     dst += dst_stride;
 
@@ -520,7 +514,6 @@
     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
     dst += dst_stride;
 
@@ -529,7 +522,6 @@
     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
     dst += dst_stride;
 
@@ -538,7 +530,6 @@
     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
     dst += dst_stride;
   }
--- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c
@@ -283,7 +283,6 @@
   ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
   DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
   SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  SAT_UH2_UH(tmp0, tmp1, 7);
 
   out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
   out = __msa_aver_u_b(out, dst0);
@@ -323,7 +322,6 @@
   DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
               tmp0, tmp1, tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
   AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
   ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
@@ -365,7 +363,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
               tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
                      dst, dst_stride);
 }
@@ -402,7 +399,6 @@
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                 tmp2, tmp3);
     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4,
                        dst, dst_stride);
     dst += (4 * dst_stride);
@@ -410,7 +406,6 @@
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                 tmp2, tmp3);
     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8,
                        dst, dst_stride);
     dst += (4 * dst_stride);
@@ -460,7 +455,6 @@
     ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
     dst += dst_stride;
 
@@ -468,19 +462,16 @@
     ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
     dst += dst_stride;
 
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
     dst += dst_stride;
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
     dst += dst_stride;
 
@@ -519,12 +510,10 @@
 
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
 
     ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
@@ -531,12 +520,10 @@
     ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
 
     ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
@@ -543,12 +530,10 @@
     ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
 
     ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
@@ -555,12 +540,10 @@
     ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
     dst += (4 * dst_stride);
 
@@ -605,12 +588,10 @@
     ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
 
     ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
@@ -617,12 +598,10 @@
     ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
     SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    SAT_UH2_UH(tmp4, tmp5, 7);
     PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
     SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    SAT_UH2_UH(tmp6, tmp7, 7);
     PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
 
     ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
@@ -629,12 +608,10 @@
     ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
 
     ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
@@ -641,12 +618,10 @@
     ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
     SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    SAT_UH2_UH(tmp4, tmp5, 7);
     PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
     SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    SAT_UH2_UH(tmp6, tmp7, 7);
     PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
     dst += (2 * dst_stride);
 
--- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c
@@ -318,7 +318,7 @@
                                  int8_t *filter) {
   v16i8 src0, src1, src2, src3, mask;
   v16u8 filt0, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, filt, const255;
+  v8u16 vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
 
@@ -326,13 +326,10 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
   SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  MIN_UH2_UH(vec2, vec3, const255);
   PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }
@@ -343,7 +340,7 @@
   v16u8 vec0, vec1, vec2, vec3, filt0;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16i8 res0, res1, res2, res3;
-  v8u16 vec4, vec5, vec6, vec7, filt, const255;
+  v8u16 vec4, vec5, vec6, vec7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[16]);
 
@@ -351,8 +348,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
   VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
   VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
@@ -359,7 +354,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
               vec6, vec7);
   SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  MIN_UH4_UH(vec4, vec5, vec6, vec7, const255);
   PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
               res2, res3);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
@@ -382,7 +376,7 @@
                                  int8_t *filter) {
   v16u8 filt0;
   v16i8 src0, src1, src2, src3, mask;
-  v8u16 vec0, vec1, vec2, vec3, const255, filt;
+  v8u16 vec0, vec1, vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -390,8 +384,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
   VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
@@ -398,7 +390,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
   ST8x4_UB(src0, src1, dst, dst_stride);
 }
@@ -408,7 +399,7 @@
                                      int8_t *filter, int32_t height) {
   v16u8 filt0;
   v16i8 src0, src1, src2, src3, mask, out0, out1;
-  v8u16 vec0, vec1, vec2, vec3, filt, const255;
+  v8u16 vec0, vec1, vec2, vec3, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -416,8 +407,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
 
@@ -426,7 +415,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
 
   LD_SB4(src, src_stride, src0, src1, src2, src3);
   src += (4 * src_stride);
@@ -440,7 +428,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
               vec2, vec3);
   SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
   PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
   ST8x4_UB(out0, out1, dst, dst_stride);
   dst += (4 * dst_stride);
@@ -454,7 +441,6 @@
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     LD_SB4(src, src_stride, src0, src1, src2, src3);
     src += (4 * src_stride);
 
@@ -466,7 +452,6 @@
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                 vec2, vec3);
     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
     ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
   }
@@ -488,7 +473,7 @@
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -498,8 +483,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   LD_SB4(src, src_stride, src0, src2, src4, src6);
   LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
   src += (4 * src_stride);
@@ -514,8 +497,6 @@
               out6, out7);
   SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
   SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-  MIN_UH4_UH(out0, out1, out2, out3, const255);
-  MIN_UH4_UH(out4, out5, out6, out7, const255);
   PCKEV_ST_SB(out0, out1, dst);
   dst += dst_stride;
   PCKEV_ST_SB(out2, out3, dst);
@@ -540,8 +521,6 @@
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_ST_SB(out0, out1, dst);
     dst += dst_stride;
     PCKEV_ST_SB(out2, out3, dst);
@@ -559,7 +538,7 @@
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -567,8 +546,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   for (loop_cnt = height >> 1; loop_cnt--;) {
     src0 = LD_SB(src);
     src2 = LD_SB(src + 16);
@@ -591,8 +568,6 @@
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_ST_SB(out0, out1, dst);
     PCKEV_ST_SB(out2, out3, dst + 16);
     dst += dst_stride;
@@ -608,7 +583,7 @@
   uint32_t loop_cnt;
   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
   v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt, const255;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
 
   mask = LD_SB(&mc_filt_mask_arr[0]);
 
@@ -616,8 +591,6 @@
   filt = LD_UH(filter);
   filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
 
-  const255 = (v8u16) __msa_ldi_h(255);
-
   for (loop_cnt = height; loop_cnt--;) {
     src0 = LD_SB(src);
     src2 = LD_SB(src + 16);
@@ -637,8 +610,6 @@
                 out6, out7);
     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
     PCKEV_ST_SB(out0, out1, dst);
     PCKEV_ST_SB(out2, out3, dst + 16);
     PCKEV_ST_SB(out4, out5, dst + 32);
--- a/vpx_dsp/mips/vpx_convolve8_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_msa.c
@@ -256,7 +256,6 @@
   ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
   DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
   SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  SAT_UH2_UH(tmp0, tmp1, 7);
   PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
 }
@@ -298,7 +297,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
               vec4, vec5, vec6, vec7);
   SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
   PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
               res2, res3);
   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
@@ -357,7 +355,6 @@
   tmp3 = __msa_dotp_u_h(vec3, filt_vt);
 
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
   ST8x4_UB(out0, out1, dst, dst_stride);
 }
@@ -402,7 +399,6 @@
     tmp2 = __msa_dotp_u_h(vec0, filt_vt);
 
     SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    SAT_UH2_UH(tmp1, tmp2, 7);
 
     hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
@@ -415,7 +411,6 @@
     tmp4 = __msa_dotp_u_h(vec0, filt_vt);
 
     SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
-    SAT_UH2_UH(tmp3, tmp4, 7);
     PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
@@ -437,7 +432,6 @@
     tmp8 = __msa_dotp_u_h(vec0, filt_vt);
 
     SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
-    SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
     PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
@@ -492,7 +486,6 @@
     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
     SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    SAT_UH2_UH(tmp1, tmp2, 7);
     PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
 
@@ -501,7 +494,6 @@
     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
     SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    SAT_UH2_UH(tmp1, tmp2, 7);
     PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
 
@@ -510,7 +502,6 @@
     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
     SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    SAT_UH2_UH(tmp1, tmp2, 7);
     PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
 
@@ -519,7 +510,6 @@
     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
     SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
-    SAT_UH2_UH(tmp1, tmp2, 7);
     PCKEV_ST_SB(tmp1, tmp2, dst);
     dst += dst_stride;
   }
--- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c
+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c
@@ -316,7 +316,6 @@
   ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
   DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
   SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  SAT_UH2_UH(tmp0, tmp1, 7);
   src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
   ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
 }
@@ -349,7 +348,6 @@
   DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
               tmp0, tmp1, tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
   ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
   ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
@@ -383,7 +381,6 @@
   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
               tmp2, tmp3);
   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
   PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
   ST8x4_UB(out0, out1, dst, dst_stride);
 }
@@ -416,7 +413,6 @@
     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                 tmp2, tmp3);
     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
@@ -424,7 +420,6 @@
     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                 tmp2, tmp3);
     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
     PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
     ST8x4_UB(out0, out1, dst, dst_stride);
     dst += (4 * dst_stride);
@@ -467,7 +462,6 @@
     ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst);
     dst += dst_stride;
 
@@ -475,19 +469,16 @@
     ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst);
     dst += dst_stride;
 
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst);
     dst += dst_stride;
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst);
     dst += dst_stride;
 
@@ -522,11 +513,9 @@
 
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst);
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
 
     ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
@@ -533,12 +522,10 @@
     ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
 
     ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
@@ -545,12 +532,10 @@
     ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst + 16);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
 
     ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
@@ -557,12 +542,10 @@
     ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
     dst += (4 * dst_stride);
 
@@ -598,12 +581,10 @@
     ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
 
     ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
@@ -610,12 +591,10 @@
     ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
     SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    SAT_UH2_UH(tmp4, tmp5, 7);
     PCKEV_ST_SB(tmp4, tmp5, dst + 16);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
     SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    SAT_UH2_UH(tmp6, tmp7, 7);
     PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
 
     ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
@@ -622,12 +601,10 @@
     ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
     PCKEV_ST_SB(tmp0, tmp1, dst + 32);
 
     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
     PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
 
     ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
@@ -634,12 +611,10 @@
     ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
     SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    SAT_UH2_UH(tmp4, tmp5, 7);
     PCKEV_ST_SB(tmp4, tmp5, dst + 48);
 
     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
     SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    SAT_UH2_UH(tmp6, tmp7, 7);
     PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
     dst += (2 * dst_stride);