ref: ce3f4ade670cf02e05998f4ca50e08736802f5e7
parent: 7266bedc041b4bbc3e823226f14d70e97892d959
parent: 37c68efee2daa98d8abc9853d2210477d5c0104f
author: Debargha Mukherjee <[email protected]>
date: Tue Oct 6 18:28:11 EDT 2015
Merge "SSSE3 optimisation for quantize in high bit depth"
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -849,10 +849,10 @@
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b sse2/;
+ specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32/;
+ specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_highbd_quantize_b sse2/;
--- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -53,15 +53,31 @@
%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea coeffq, [ coeffq+ncoeffq*4] ; 4-byte entries: point one past the last coeff (ncoeffq is negated below and used as a negative index)
+ lea qcoeffq, [ qcoeffq+ncoeffq*4] ; qcoeff entries are 4 bytes wide in this build as well
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4] ; dqcoeff entries are 4 bytes wide in this build as well
+%else
lea coeffq, [ coeffq+ncoeffq*2]
- lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+ lea iscanq, [ iscanq+ncoeffq*2] ; iscan is scaled by 2 in both builds, so it is advanced outside the %if
neg ncoeffq
; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; coeff is stored as 32-bit values; narrow 16 of them to 16-bit words for the word-sized math below
+ mova m9, [ coeffq+ncoeffq*4+ 0] ; 32-bit c[i+0..3]
+ mova m6, [ coeffq+ncoeffq*4+16] ; 32-bit c[i+4..7] (m6 is scratch here; pabsw overwrites it below)
+ mova m10, [ coeffq+ncoeffq*4+32] ; 32-bit c[i+8..11]
+ mova m11, [ coeffq+ncoeffq*4+48] ; 32-bit c[i+12..15] (m11 is scratch here; pabsw overwrites it below)
+ packssdw m9, m6 ; m9 = c[i], first 8 coeffs as words (pack with signed saturation)
+ packssdw m10, m11 ; m10 = c[i], next 8 coeffs as words; NOTE(review): inputs outside int16 saturate - confirm callers keep coeffs in range
+%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
@@ -82,8 +98,28 @@
psignw m13, m10 ; m13 = reinsert sign
pand m8, m7
pand m13, m12
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign-extend the 16-bit results in m8/m13 to 32 bits and store them to qcoeff
+ mova m11, m8 ; copy that will receive the low 4 words
+ mova m6, m8 ; copy that will receive the high 4 words
+ pcmpgtw m5, m8 ; m5 = (0 > m8): 0xFFFF for each negative word (relies on m5 == 0 on entry)
+ punpcklwd m11, m5 ; interleave low words with their sign words -> 4 sign-extended dwords
+ punpckhwd m6, m5 ; interleave high words with their sign words -> 4 sign-extended dwords
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; back to zero so the next pcmpgtw yields a clean sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = sign mask of m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
+%endif
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
@@ -97,8 +133,28 @@
psignw m8, m9
psignw m13, m10
%endif
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m8 ; copy that will receive the low 4 words
+ mova m6, m8 ; copy that will receive the high 4 words
+ pcmpgtw m5, m8 ; m5 = (0 > m8): 0xFFFF for each negative word (relies on m5 == 0 on entry)
+ punpcklwd m11, m5 ; interleave low words with their sign words -> 4 sign-extended dwords
+ punpckhwd m6, m5 ; interleave high words with their sign words -> 4 sign-extended dwords
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; back to zero so the next pcmpgtw yields a clean sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = sign mask of m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -112,8 +168,18 @@
jz .accumulate_eob
.ac_only_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; pack the next 16 coeffs from 32-bit storage down to 16-bit words
+ mova m9, [ coeffq+ncoeffq*4+ 0] ; 32-bit c[i+0..3]
+ mova m6, [ coeffq+ncoeffq*4+16] ; 32-bit c[i+4..7] (m6 is scratch here; pabsw overwrites it below)
+ mova m10, [ coeffq+ncoeffq*4+32] ; 32-bit c[i+8..11]
+ mova m11, [ coeffq+ncoeffq*4+48] ; 32-bit c[i+12..15] (m11 is scratch here; pabsw overwrites it below)
+ packssdw m9, m6 ; m9 = c[i], first 8 coeffs as words (pack with signed saturation)
+ packssdw m10, m11 ; m10 = c[i], next 8 coeffs as words; NOTE(review): inputs outside int16 saturate - confirm callers keep coeffs in range
+%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
@@ -136,8 +202,29 @@
psignw m13, m10 ; m13 = reinsert sign
pand m14, m7
pand m13, m12
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign-extend the 16-bit results in m14/m13 to 32 bits and store them to qcoeff
+ pxor m11, m11 ; NOTE(review): redundant - m11 is overwritten by the mova on the next line
+ mova m11, m14 ; copy that will receive the low 4 words
+ mova m6, m14 ; copy that will receive the high 4 words
+ pcmpgtw m5, m14 ; m5 = (0 > m14): 0xFFFF for each negative word (relies on m5 == 0 on entry)
+ punpcklwd m11, m5 ; interleave low words with their sign words -> 4 sign-extended dwords
+ punpckhwd m6, m5 ; interleave high words with their sign words -> 4 sign-extended dwords
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; back to zero so the next pcmpgtw yields a clean sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = sign mask of m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
+%endif
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
@@ -150,8 +237,28 @@
psignw m14, m9
psignw m13, m10
%endif
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m14 ; copy that will receive the low 4 words
+ mova m6, m14 ; copy that will receive the high 4 words
+ pcmpgtw m5, m14 ; m5 = (0 > m14): 0xFFFF for each negative word (relies on m5 == 0 on entry)
+ punpcklwd m11, m5 ; interleave low words with their sign words -> 4 sign-extended dwords
+ punpckhwd m6, m5 ; interleave high words with their sign words -> 4 sign-extended dwords
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; back to zero so the next pcmpgtw yields a clean sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = sign mask of m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register (the pcmpeqw compares below test against it)
+%else
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
@@ -168,10 +275,21 @@
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4+ 0], m5 ; zero this group's qcoeff entries: 4 stores cover 64 bytes of 4-byte values (m5 is the zero register)
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5 ; same for the matching dqcoeff entries
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+%else
mova [qcoeffq+ncoeffq*2+ 0], m5
mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5
mova [dqcoeffq+ncoeffq*2+16], m5
+%endif
add ncoeffq, mmsize
jl .ac_only_loop
%endif
@@ -196,15 +314,31 @@
mov r2, qcoeffmp
mov r3, eobmp
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4] ; 4-byte entries: point one past the last element (ncoeffq is negated below and used as a negative index)
+ lea qcoeffq, [ qcoeffq+ncoeffq*4] ; same scaling for qcoeff
+%else
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
neg ncoeffq
pxor m7, m7
.blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [dqcoeffq+ncoeffq*4+ 0], m7 ; m7 was zeroed just before the loop; 4 stores clear 64 bytes of 4-byte dqcoeff entries
+ mova [dqcoeffq+ncoeffq*4+16], m7
+ mova [dqcoeffq+ncoeffq*4+32], m7
+ mova [dqcoeffq+ncoeffq*4+48], m7
+ mova [qcoeffq+ncoeffq*4+ 0], m7 ; same for the matching qcoeff entries
+ mova [qcoeffq+ncoeffq*4+16], m7
+ mova [qcoeffq+ncoeffq*4+32], m7
+ mova [qcoeffq+ncoeffq*4+48], m7
+%else
mova [dqcoeffq+ncoeffq*2+ 0], m7
mova [dqcoeffq+ncoeffq*2+16], m7
mova [qcoeffq+ncoeffq*2+ 0], m7
mova [qcoeffq+ncoeffq*2+16], m7
+%endif
add ncoeffq, mmsize
jl .blank_loop
mov word [eobq], 0