ref: 62830c53a644f5feaa49431b39c85093d2e387fc
parent: 272f46212ed96ee8c28056752c8da392229b8e17
author: chiyotsai <[email protected]>
date: Tue Oct 16 08:26:34 EDT 2018
Refactor SSE2 code for 4-tap interpolation filter on width 16.

Repeated code is refactored into inline functions. No performance
degradation is observed. These inline functions can also be used for
width 8 and width 4.
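For example, the kernel splitting that was previously open-coded as

  tmp_1 = _mm_unpacklo_epi32(kernel_reg, kernel_reg);
  kernel_reg_23 = _mm_unpackhi_epi64(tmp_1, tmp_1);
  tmp_2 = _mm_unpackhi_epi32(kernel_reg, kernel_reg);
  kernel_reg_45 = _mm_unpacklo_epi64(tmp_2, tmp_2);

is now expressed through the new helpers:

  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

Change-Id: Ibf08cc9ebd2dd47bd2a6c2bcc1616f9d4c252d4d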
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -89,6 +89,8 @@
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h
DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h
DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
--- /dev/null
+++ b/vpx_dsp/x86/convolve_sse2.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./vpx_config.h"
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// broadcasts the values at indices 2 and 3, returning 3 2 3 2 3 2 3 2.
+static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
+ __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
+ return _mm_unpackhi_epi64(tmp, tmp);
+}
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// broadcasts the values at indices 4 and 5, returning 5 4 5 4 5 4 5 4.
+static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
+ __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
+ return _mm_unpacklo_epi64(tmp, tmp);
+}
+
+// Zero-extends the low eight 8-bit words of src_1 and src_2 to 16-bit words,
+// multiply-adds them with ker_1 and ker_2 respectively to form 32-bit words,
+// then adds the two results together.
+static INLINE __m128i pad_multiply_add_add_epi8_sse2(
+ const __m128i *const src_1, const __m128i *const src_2,
+ const __m128i *const ker_1, const __m128i *const ker_2) {
+ const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
+ const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
+ const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
+ const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
+ return _mm_add_epi32(madd_1, madd_2);
+}
+
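+// Multiply-adds the 16-bit words of src_0 and src_1 with ker to form 32-bit
+// words, then packs the two results back into a single register of saturated
+// 16-bit words.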
+static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0,
+ const __m128i *const src_1,
+ const __m128i *const ker) {
+ const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
+ const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
+ return _mm_packs_epi32(madd_1, madd_2);
+}
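+
+// Interleaves the 32-bit words of src_1 and src_2, then packs the result
+// into saturated 16-bit words, returning
+// src_2[3] src_1[3] src_2[2] src_1[2] src_2[1] src_1[1] src_2[0] src_1[0].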
+static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1,
+ const __m128i *const src_2) {
+ const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
+ const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
+ return _mm_packs_epi32(tmp_1, tmp_2);
+}
+
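+// Rounds each 16-bit word of src by adding half_depth (which should hold
+// 2^(depth - 1)) with saturation, then arithmetically shifts the sum right
+// by depth.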
+static INLINE __m128i round_epi16_sse2(const __m128i *const src,
+ const __m128i *const half_depth,
+ const int depth) {
+ const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
+ return _mm_srai_epi16(nearest_src, depth);
+}
+
+#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -13,6 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"
void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -26,8 +27,6 @@
__m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
__m128i dst_first, dst_second;
__m128i even, odd;
- __m128i tmp_1, tmp_2;
- __m128i madd_1, madd_2;
// Start one pixel before as we need tap/2 - 1 = 1 sample from the past
src_ptr -= 1;
@@ -35,10 +34,8 @@
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
- tmp_1 = _mm_unpacklo_epi32(kernel_reg, kernel_reg);
- kernel_reg_23 = _mm_unpackhi_epi64(tmp_1, tmp_1);
- tmp_2 = _mm_unpackhi_epi32(kernel_reg, kernel_reg);
- kernel_reg_45 = _mm_unpacklo_epi64(tmp_2, tmp_2);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
for (h = height; h > 0; --h) {
// We will load multiple shifted versions of the row and shuffle them into
@@ -57,23 +54,15 @@
src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
// Output 6 4 2 0
- tmp_1 = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
- tmp_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
- madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
- madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
- even = _mm_add_epi32(madd_1, madd_2);
+ even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
// Output 7 5 3 1
- tmp_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
- tmp_2 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
- madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
- madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
- odd = _mm_add_epi32(madd_1, madd_2);
+ odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
// Combine to get the first half of the dst
- tmp_1 = _mm_unpacklo_epi32(even, odd);
- tmp_2 = _mm_unpackhi_epi32(even, odd);
- dst_first = _mm_packs_epi32(tmp_1, tmp_2);
+ dst_first = combine_epi32_sse2(&even, &odd);
// Do again to get the second half of dst
src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
@@ -82,29 +71,19 @@
src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
// Output 14 12 10 8
- tmp_1 = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
- tmp_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
- madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
- madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
- even = _mm_add_epi32(madd_1, madd_2);
+ even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2,
+ &kernel_reg_23, &kernel_reg_45);
// Output 15 13 11 9
- tmp_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
- tmp_2 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
- madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
- madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
- odd = _mm_add_epi32(madd_1, madd_2);
+ odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+ &kernel_reg_23, &kernel_reg_45);
// Combine to get the second half of the dst
- tmp_1 = _mm_unpacklo_epi32(even, odd);
- tmp_2 = _mm_unpackhi_epi32(even, odd);
- dst_second = _mm_packs_epi32(tmp_1, tmp_2);
+ dst_second = combine_epi32_sse2(&even, &odd);
// Round each result
- dst_first = _mm_adds_epi16(dst_first, reg_32);
- dst_first = _mm_srai_epi16(dst_first, 6);
- dst_second = _mm_adds_epi16(dst_second, reg_32);
- dst_second = _mm_srai_epi16(dst_second, 6);
+ dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+ dst_second = round_epi16_sse2(&dst_second, &reg_32, 6);
// Finally combine to get the final dst
dst_first = _mm_packus_epi16(dst_first, dst_second);
@@ -143,7 +122,6 @@
__m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding
- __m128i tmp_0, tmp_1;
// We will compute the result two rows at a time
const ptrdiff_t src_stride_unrolled = src_stride << 1;
@@ -157,13 +135,12 @@
// Load Kernel
kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
kernel_reg = _mm_srai_epi16(kernel_reg, 1);
- tmp_0 = _mm_unpacklo_epi32(kernel_reg, kernel_reg);
- kernel_reg_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- tmp_1 = _mm_unpackhi_epi32(kernel_reg, kernel_reg);
- kernel_reg_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+ kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
// We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
- // words, shuffle the data into the form
+ // words, then shuffle the data into the form
// ... s[0,1] s[-1,1] s[0,0] s[-1,0]
// ... s[0,7] s[-1,7] s[0,6] s[-1,6]
// ... s[0,9] s[-1,9] s[0,8] s[-1,8]
@@ -204,25 +181,21 @@
src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
// Partial output from first half
- tmp_0 = _mm_madd_epi16(src_reg_m10_lo_1, kernel_reg_23);
- tmp_1 = _mm_madd_epi16(src_reg_m10_lo_2, kernel_reg_23);
- res_reg_m10_lo = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_m10_lo = multiply_add_packs_epi16_sse2(
+ &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
- tmp_0 = _mm_madd_epi16(src_reg_01_lo_1, kernel_reg_23);
- tmp_1 = _mm_madd_epi16(src_reg_01_lo_2, kernel_reg_23);
- res_reg_01_lo = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_01_lo = multiply_add_packs_epi16_sse2(
+ &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23);
src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(src_reg_12_lo_1, kernel_reg_45);
- tmp_1 = _mm_madd_epi16(src_reg_12_lo_2, kernel_reg_45);
- res_reg_12_lo = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_12_lo = multiply_add_packs_epi16_sse2(
+ &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45);
src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(src_reg_23_lo_1, kernel_reg_45);
- tmp_1 = _mm_madd_epi16(src_reg_23_lo_2, kernel_reg_45);
- res_reg_23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_23_lo = multiply_add_packs_epi16_sse2(
+ &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45);
// Add to get first half of the results
res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
@@ -230,39 +203,31 @@
// Now repeat everything again for the second half
// Partial output for second half
- tmp_0 = _mm_madd_epi16(src_reg_m10_hi_1, kernel_reg_23);
- tmp_1 = _mm_madd_epi16(src_reg_m10_hi_2, kernel_reg_23);
- res_reg_m10_hi = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_m10_hi = multiply_add_packs_epi16_sse2(
+ &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);
- tmp_0 = _mm_madd_epi16(src_reg_01_hi_1, kernel_reg_23);
- tmp_1 = _mm_madd_epi16(src_reg_01_hi_2, kernel_reg_23);
- res_reg_01_hi = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_01_hi = multiply_add_packs_epi16_sse2(
+ &src_reg_01_hi_1, &src_reg_01_hi_2, &kernel_reg_23);
src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(src_reg_12_hi_1, kernel_reg_45);
- tmp_1 = _mm_madd_epi16(src_reg_12_hi_2, kernel_reg_45);
- res_reg_12_hi = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_12_hi = multiply_add_packs_epi16_sse2(
+ &src_reg_12_hi_1, &src_reg_12_hi_2, &kernel_reg_45);
src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(src_reg_23_hi_1, kernel_reg_45);
- tmp_1 = _mm_madd_epi16(src_reg_23_hi_2, kernel_reg_45);
- res_reg_23_hi = _mm_packs_epi32(tmp_0, tmp_1);
+ res_reg_23_hi = multiply_add_packs_epi16_sse2(
+ &src_reg_23_hi_1, &src_reg_23_hi_2, &kernel_reg_45);
- // First half of the results
+ // Second half of the results
res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
// Round the words
- res_reg_m1012_lo = _mm_adds_epi16(res_reg_m1012_lo, reg_32);
- res_reg_0123_lo = _mm_adds_epi16(res_reg_0123_lo, reg_32);
- res_reg_m1012_hi = _mm_adds_epi16(res_reg_m1012_hi, reg_32);
- res_reg_0123_hi = _mm_adds_epi16(res_reg_0123_hi, reg_32);
- res_reg_m1012_lo = _mm_srai_epi16(res_reg_m1012_lo, 6);
- res_reg_0123_lo = _mm_srai_epi16(res_reg_0123_lo, 6);
- res_reg_m1012_hi = _mm_srai_epi16(res_reg_m1012_hi, 6);
- res_reg_0123_hi = _mm_srai_epi16(res_reg_0123_hi, 6);
+ res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+ res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+ res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+ res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
// Combine to get the result
res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);