ref: ad0ed535a702ebe4164c0bf5fd5f3211269f1fad
parent: f2c039115d8c63e19a8f5fab765360f0be672d41
author: Johann <[email protected]>
date: Wed Oct 24 11:48:32 EDT 2018
vp8 bilinear: rewrite 16x16 Marginally faster. Most importantly it drops a dependency on an external symbol (vp8_bilinear_filters_x86_8). Change-Id: Iff022e718720f1f0eeced6201a1ad69a9c9c4f45
--- a/vp8/common/x86/bilinear_filter_sse2.c
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -12,8 +12,132 @@
#include <xmmintrin.h>
#include "./vp8_rtcd.h"
+#include "./vpx_config.h"
#include "vp8/common/filter.h"
#include "vpx_ports/mem.h"
+
+static INLINE void horizontal_16x16(uint8_t *src, const int stride,
+ uint16_t *dst, const int xoffset) {
+ int h;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (xoffset == 0) {
+ for (h = 0; h < 17; ++h) {
+ const __m128i a = _mm_loadu_si128((__m128i *)src);
+ const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
+ const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
+ _mm_store_si128((__m128i *)dst, a_lo);
+ _mm_store_si128((__m128i *)(dst + 8), a_hi);
+ src += stride;
+ dst += 16;
+ }
+ return;
+ }
+
+ {
+ const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+ const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+ const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+ for (h = 0; h < 17; ++h) {
+ const __m128i a = _mm_loadu_si128((__m128i *)src);
+ const __m128i a_lo = _mm_unpacklo_epi8(a, zero);
+ const __m128i a_hi = _mm_unpackhi_epi8(a, zero);
+ const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0);
+ const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0);
+
+ const __m128i b = _mm_loadu_si128((__m128i *)(src + 1));
+ const __m128i b_lo = _mm_unpacklo_epi8(b, zero);
+ const __m128i b_hi = _mm_unpackhi_epi8(b, zero);
+ const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1);
+ const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1);
+
+ const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered);
+ const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered);
+
+ const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
+ const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);
+
+ const __m128i shifted_lo =
+ _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
+ const __m128i shifted_hi =
+ _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);
+
+ _mm_store_si128((__m128i *)dst, shifted_lo);
+ _mm_store_si128((__m128i *)(dst + 8), shifted_hi);
+ src += stride;
+ dst += 16;
+ }
+ }
+}
+
+static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
+ const int yoffset) {
+ int h;
+
+ if (yoffset == 0) {
+ for (h = 0; h < 16; ++h) {
+ const __m128i row_lo = _mm_load_si128((__m128i *)src);
+ const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8));
+ const __m128i packed = _mm_packus_epi16(row_lo, row_hi);
+ _mm_store_si128((__m128i *)dst, packed);
+ src += 16;
+ dst += stride;
+ }
+ return;
+ }
+
+ {
+ const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+ const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+ const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+ __m128i row_0_lo = _mm_load_si128((__m128i *)src);
+ __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8));
+ src += 16;
+ for (h = 0; h < 16; ++h) {
+ const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0);
+ const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0);
+
+ const __m128i row_1_lo = _mm_load_si128((__m128i *)src);
+ const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8));
+ const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1);
+ const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1);
+
+ const __m128i sum_lo =
+ _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered);
+ const __m128i sum_hi =
+ _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered);
+
+ const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor);
+ const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor);
+
+ const __m128i shifted_lo =
+ _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT);
+ const __m128i shifted_hi =
+ _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT);
+
+ const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi);
+ _mm_store_si128((__m128i *)dst, packed);
+ row_0_lo = row_1_lo;
+ row_0_hi = row_1_hi;
+ src += 16;
+ dst += stride;
+ }
+ }
+}
+
+void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, uint8_t *dst_ptr,
+ int dst_pitch) {
+ uint16_t FData[16 * 17];
+
+ assert((xoffset | yoffset) != 0);
+
+ horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);
+
+ vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
+}
static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
const int xoffset, const int height) {
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -10,7 +10,6 @@
%include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
@@ -957,274 +956,6 @@
pop rbp
ret
-
-;void vp8_bilinear_predict16x16_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp8_bilinear_filters_x86_8)
-global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
-sym(vp8_bilinear_predict16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
-
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
- movsxd rax, dword ptr arg(2) ;xoffset
-
- cmp rax, 0 ;skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
-
- cmp rax, 0 ;skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;dst_pitch
-%endif
- ; get the first horizontal line done
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- add rsi, rdx ; next line
-.next_row:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, [rax]
- pmullw xmm6, [rax]
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- pmullw xmm3, [rax+16]
- pmullw xmm4, [rax+16]
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rdx ; next line
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ;dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- add rsi, rax ; next line
-.next_row_spo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- movdqa xmm4, xmm3 ; make a copy of current line
- movdqa xmm7, xmm3
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm5, xmm1
- pmullw xmm6, xmm1
- pmullw xmm3, xmm2
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ;dst_pitch
- cmp rdi, rcx
- jne .next_row_spo
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.next_row_fpo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP8_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ; dst_pitch
- cmp rdi, rcx
- jne .next_row_fpo
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
SECTION_RODATA
align 16