ref: 0c00af126dcda6fddcc683cc15c1be06b3285054
parent: 7e77938d7200a5a55490130e156a918a241100bd
author: Alex Converse <[email protected]>
date: Tue Oct 6 11:59:03 EDT 2015
Add vpx_highbd_convolve_{copy,avg}_sse2

single-threaded:
swanky (silvermont): ~1% faster overall
peppy (celeron, haswell): ~1.5% faster overall

Change-Id: Ib74f014374c63c9eaf2d38191cbd8e2edcc52073
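For reference, the two kernels being accelerated implement a plain block copy and a rounding average into the destination, operating on 16-bit samples. A minimal C sketch of the intended behavior (illustrative only; the real C fallbacks live in vpx_dsp/vpx_convolve.c and take the full convolve signature with filter arguments and a bit-depth parameter, and the helper names below are hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    /* Copy a w x h block of 16-bit samples (strides in samples). */
    static void highbd_copy_block(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x) dst[x] = src[x];
        src += src_stride;
        dst += dst_stride;
      }
    }

    /* Rounding average into the destination: (s + d + 1) >> 1 per sample,
     * which is what pavgw computes for each 16-bit lane. */
    static void highbd_avg_block(const uint16_t *src, ptrdiff_t src_stride,
                                 uint16_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x)
          dst[x] = (uint16_t)(((uint32_t)src[x] + dst[x] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
      }
    }

Both kernels are memory-bound, which is consistent with the modest overall gains quoted above.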
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -973,6 +973,14 @@
w, h, bd); \
}
#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_USE_X86INC
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_avg_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_avg_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve_avg_sse2, 12)
+#endif // CONFIG_USE_X86INC
WRAP(convolve8_horiz_sse2, 8)
WRAP(convolve8_avg_horiz_sse2, 8)
WRAP(convolve8_vert_sse2, 8)
@@ -1116,7 +1124,11 @@
#if HAVE_SSE2 && ARCH_X86_64
#if CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_sse2(
+#if CONFIG_USE_X86INC
+ wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
+#else
wrap_convolve_copy_c_8, wrap_convolve_avg_c_8,
+#endif // CONFIG_USE_X86INC
wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
@@ -1124,7 +1136,11 @@
wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
const ConvolveFunctions convolve10_sse2(
+#if CONFIG_USE_X86INC
+ wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
+#else
wrap_convolve_copy_c_10, wrap_convolve_avg_c_10,
+#endif // CONFIG_USE_X86INC
wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
@@ -1132,7 +1148,11 @@
wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
const ConvolveFunctions convolve12_sse2(
+#if CONFIG_USE_X86INC
+ wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
+#else
wrap_convolve_copy_c_12, wrap_convolve_avg_c_12,
+#endif // CONFIG_USE_X86INC
wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
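The WRAP macro used in the first hunk adapts each high-bit-depth function, which carries a trailing bit-depth argument, to the ten-argument signature the test harness stores in ConvolveFunctions. Judging from the trailing context lines ("w, h, bd); \"), it expands roughly as follows (a sketch of the macro's shape, not a verbatim copy of the test file):

    /* WRAP(convolve_copy_sse2, 10) defines wrap_convolve_copy_sse2_10,
     * a thunk forwarding to vpx_highbd_convolve_copy_sse2 with bd = 10. */
    #define WRAP(func, bd)                                                  \
      void wrap_##func##_##bd(                                              \
          const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,           \
          ptrdiff_t dst_stride, const int16_t *filter_x,                    \
          int filter_x_stride, const int16_t *filter_y,                     \
          int filter_y_stride, int w, int h) {                              \
        vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x,       \
                          filter_x_stride, filter_y, filter_y_stride,       \
                          w, h, bd);                                        \
      }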
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -434,10 +434,10 @@
# Sub Pixel Filters
#
add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve_copy/;
+ specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
- specialize qw/vpx_highbd_convolve_avg/;
+ specialize qw/vpx_highbd_convolve_avg/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
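In the rtcd script, $sse2_x86inc evaluates to "sse2" only when CONFIG_USE_X86INC is enabled and is empty otherwise, so the new specializations are registered only in builds that can assemble x86inc code. Conceptually, the generated vpx_dsp_rtcd.h then resolves each symbol along these lines (a simplified sketch; the real header also supports runtime CPU detection via function pointers):

    void vpx_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bps);
    void vpx_highbd_convolve_copy_sse2(const uint8_t *src,
                                       ptrdiff_t src_stride, uint8_t *dst,
                                       ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h, int bps);
    #if HAVE_SSE2 && CONFIG_USE_X86INC
    #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2
    #else
    #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
    #endif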
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -12,13 +12,28 @@
SECTION .text
-%macro convolve_fn 1
+%macro convolve_fn 1-2
INIT_XMM sse2
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+ fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
fx, fxs, fy, fys, w, h
+%endif
mov r4d, dword wm
+%ifidn %2, highbd
+ shl r4d, 1
+ shl srcq, 1
+ shl src_strideq, 1
+ shl dstq, 1
+ shl dst_strideq, 1
+%else
cmp r4d, 4
je .w4
+%endif
cmp r4d, 8
je .w8
cmp r4d, 16
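The highbd prologue in the hunk above shifts the width, both pointers, and both strides left by one. The width and stride shifts convert sample counts into byte counts; the pointer shifts additionally undo libvpx's CONVERT_TO_BYTEPTR convention of the time, under which a high-bit-depth buffer was passed as its real address shifted right by one. A hedged C model of that pointer recovery (assuming that convention; recover_highbd_ptr is a hypothetical name):

    #include <stdint.h>

    /* Mirrors "shl srcq, 1": recover the real uint16_t pointer from the
     * right-shifted byte pointer used by CONVERT_TO_BYTEPTR builds. */
    static uint16_t *recover_highbd_ptr(const uint8_t *p) {
      return (uint16_t *)((uintptr_t)p << 1);
    }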
@@ -25,8 +40,49 @@
je .w16
cmp r4d, 32
je .w32
+%ifidn %2, highbd
+ cmp r4d, 64
+ je .w64
mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop128
+ RET
+%endif
+
+.w64:
+  mov r4d, dword hm
.loop64:
movu m0, [srcq]
movu m1, [srcq+16]
@@ -34,10 +90,10 @@
movu m3, [srcq+48]
add srcq, src_strideq
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+16]
- pavgb m2, [dstq+32]
- pavgb m3, [dstq+48]
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
%endif
mova [dstq ], m0
mova [dstq+16], m1
@@ -57,10 +113,10 @@
movu m3, [srcq+src_strideq+16]
lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq +16]
- pavgb m2, [dstq+dst_strideq]
- pavgb m3, [dstq+dst_strideq+16]
+ pavg m0, [dstq]
+ pavg m1, [dstq +16]
+ pavg m2, [dstq+dst_strideq]
+ pavg m3, [dstq+dst_strideq+16]
%endif
mova [dstq ], m0
mova [dstq +16], m1
@@ -82,10 +138,10 @@
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
- pavgb m2, [dstq+dst_strideq*2]
- pavgb m3, [dstq+r6q]
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
@@ -108,10 +164,10 @@
movu m3, [srcq+r5q]
lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+dst_strideq]
- pavgb m2, [dstq+dst_strideq*2]
- pavgb m3, [dstq+r6q]
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
%endif
mova [dstq ], m0
mova [dstq+dst_strideq ], m1
@@ -122,6 +178,7 @@
jnz .loop8
RET
+%ifnidn %2, highbd
.w4:
mov r4d, dword hm
lea r5q, [src_strideq*3]
@@ -137,10 +194,10 @@
movh m5, [dstq+dst_strideq]
movh m6, [dstq+dst_strideq*2]
movh m7, [dstq+r6q]
- pavgb m0, m4
- pavgb m1, m5
- pavgb m2, m6
- pavgb m3, m7
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
%endif
movh [dstq ], m0
movh [dstq+dst_strideq ], m1
@@ -150,7 +207,12 @@
sub r4d, 4
jnz .loop4
RET
+%endif
%endmacro
convolve_fn copy
convolve_fn avg
+%if CONFIG_VP9_HIGHBITDEPTH
+convolve_fn copy, highbd
+convolve_fn avg, highbd
+%endif
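With the units doubled, the byte-oriented row loops are shared between bit depths: highbd w == 4 (8 bytes per row) lands on the existing .w8 path, w == 32 reuses .w64, and w == 64 (128 bytes per row) gets the new .loop128, while the 8-bit-only .w4 path is compiled out with %ifnidn. The only arithmetic change is that pavg is aliased to pavgw instead of pavgb, since the rounding average must respect 16-bit lane boundaries. A one-line C sketch of the equivalence the width doubling relies on (highbd_row_copy is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* A row of w 16-bit samples is the same memory as 2*w bytes, so the
     * 8-bit copy loops work unchanged once w and the strides are doubled. */
    static void highbd_row_copy(const uint16_t *src, uint16_t *dst, int w) {
      memcpy(dst, src, (size_t)w * sizeof(*src)); /* copies 2*w bytes */
    }

Instantiating the macro as "convolve_fn copy, highbd" then emits vpx_highbd_convolve_copy_sse2 via cglobal's name prefixing, matching the prototypes registered in vpx_dsp_rtcd_defs.pl above.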