ref: 49dc7b05d0e026c945c8ee11266979b1e4830fbb
parent: 5887aededbd96c07101d9ca2ddc9e6c33b34df15
parent: 13284311eb5fe4c6bce739e2f721854dbd3092ea
author: Frank Galligan <[email protected]>
date: Thu Sep 18 11:10:16 EDT 2014
Merge "FIX: vp9_loopfilter_intrin_sse2.c"
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -351,27 +351,34 @@
}
}
-static INLINE __m128i filter_add2_sub2(__m128i total, __m128i a1, __m128i a2,
- __m128i s1, __m128i s2) {
- total = _mm_add_epi16(a1, total);
- total = _mm_add_epi16(_mm_sub_epi16(total, _mm_add_epi16(s1, s2)), a2);
- return total;
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+ const __m128i *const a1,
+ const __m128i *const a2,
+ const __m128i *const s1,
+ const __m128i *const s2) {
+ __m128i x = _mm_add_epi16(*a1, *total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+ return x;
}
-static INLINE __m128i filter8_mask(__m128i flat, __m128i other_filt,
- __m128i f8_lo, __m128i f8_hi) {
- const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(f8_lo, 3),
- _mm_srli_epi16(f8_hi, 3));
- const __m128i result = _mm_and_si128(flat, f8);
- return _mm_or_si128(_mm_andnot_si128(flat, other_filt), result);
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f8_lo,
+ const __m128i *const f8_hi) {
+ const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
+ _mm_srli_epi16(*f8_hi, 3));
+ const __m128i result = _mm_and_si128(*flat, f8);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
-static INLINE __m128i filter16_mask(__m128i flat, __m128i other_filt,
- __m128i f_lo, __m128i f_hi) {
- const __m128i f = _mm_packus_epi16(_mm_srli_epi16(f_lo, 4),
- _mm_srli_epi16(f_hi, 4));
- const __m128i result = _mm_and_si128(flat, f);
- return _mm_or_si128(_mm_andnot_si128(flat, other_filt), result);
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f_lo,
+ const __m128i *const f_hi) {
+ const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
+ _mm_srli_epi16(*f_hi, 4));
+ const __m128i result = _mm_and_si128(*flat, f);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
@@ -554,27 +561,27 @@
_mm_add_epi16(p2_hi, p1_hi));
f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
- op2 = filter8_mask(flat, p2, f8_lo, f8_hi);
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
- f8_lo = filter_add2_sub2(f8_lo, q1_lo, p1_lo, p2_lo, p3_lo);
- f8_hi = filter_add2_sub2(f8_hi, q1_hi, p1_hi, p2_hi, p3_hi);
- op1 = filter8_mask(flat, op1, f8_lo, f8_hi);
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
- f8_lo = filter_add2_sub2(f8_lo, q2_lo, p0_lo, p1_lo, p3_lo);
- f8_hi = filter_add2_sub2(f8_hi, q2_hi, p0_hi, p1_hi, p3_hi);
- op0 = filter8_mask(flat, op0, f8_lo, f8_hi);
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
- f8_lo = filter_add2_sub2(f8_lo, q3_lo, q0_lo, p0_lo, p3_lo);
- f8_hi = filter_add2_sub2(f8_hi, q3_hi, q0_hi, p0_hi, p3_hi);
- oq0 = filter8_mask(flat, oq0, f8_lo, f8_hi);
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
- f8_lo = filter_add2_sub2(f8_lo, q3_lo, q1_lo, q0_lo, p2_lo);
- f8_hi = filter_add2_sub2(f8_hi, q3_hi, q1_hi, q0_hi, p2_hi);
- oq1 = filter8_mask(flat, oq1, f8_lo, f8_hi);
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
- f8_lo = filter_add2_sub2(f8_lo, q3_lo, q2_lo, q1_lo, p1_lo);
- f8_hi = filter_add2_sub2(f8_hi, q3_hi, q2_hi, q1_hi, p1_hi);
- oq2 = filter8_mask(flat, q2, f8_lo, f8_hi);
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -634,72 +641,72 @@
f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
- p6 = filter16_mask(flat2, p6, f_lo, f_hi);
+ p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 7 * p), p6);
- f_lo = filter_add2_sub2(f_lo, q1_lo, p5_lo, p6_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q1_hi, p5_hi, p6_hi, p7_hi);
- p5 = filter16_mask(flat2, p5, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 6 * p), p5);
- f_lo = filter_add2_sub2(f_lo, q2_lo, p4_lo, p5_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q2_hi, p4_hi, p5_hi, p7_hi);
- p4 = filter16_mask(flat2, p4, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 5 * p), p4);
- f_lo = filter_add2_sub2(f_lo, q3_lo, p3_lo, p4_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q3_hi, p3_hi, p4_hi, p7_hi);
- p3 = filter16_mask(flat2, p3, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 4 * p), p3);
- f_lo = filter_add2_sub2(f_lo, q4_lo, p2_lo, p3_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q4_hi, p2_hi, p3_hi, p7_hi);
- op2 = filter16_mask(flat2, op2, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 3 * p), op2);
- f_lo = filter_add2_sub2(f_lo, q5_lo, p1_lo, p2_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q5_hi, p1_hi, p2_hi, p7_hi);
- op1 = filter16_mask(flat2, op1, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 2 * p), op1);
- f_lo = filter_add2_sub2(f_lo, q6_lo, p0_lo, p1_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q6_hi, p0_hi, p1_hi, p7_hi);
- op0 = filter16_mask(flat2, op0, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 1 * p), op0);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q0_lo, p0_lo, p7_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q0_hi, p0_hi, p7_hi);
- oq0 = filter16_mask(flat2, oq0, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q1_lo, p6_lo, q0_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q1_hi, p6_hi, q0_hi);
- oq1 = filter16_mask(flat2, oq1, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q2_lo, p5_lo, q1_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q2_hi, p5_hi, q1_hi);
- oq2 = filter16_mask(flat2, oq2, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q3_lo, p4_lo, q2_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q3_hi, p4_hi, q2_hi);
- q3 = filter16_mask(flat2, q3, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s + 3 * p), q3);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q4_lo, p3_lo, q3_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q4_hi, p3_hi, q3_hi);
- q4 = filter16_mask(flat2, q4, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s + 4 * p), q4);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q5_lo, p2_lo, q4_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q5_hi, p2_hi, q4_hi);
- q5 = filter16_mask(flat2, q5, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s + 5 * p), q5);
- f_lo = filter_add2_sub2(f_lo, q7_lo, q6_lo, p1_lo, q5_lo);
- f_hi = filter_add2_sub2(f_hi, q7_hi, q6_hi, p1_hi, q5_hi);
- q6 = filter16_mask(flat2, q6, f_lo, f_hi);
+ f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
+ q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
_mm_storeu_si128((__m128i *)(s + 6 * p), q6);
}
// wide flat