shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -136,7 +136,7 @@

   specialize qw/vp9_block_error_fp sse2/;

   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

-  specialize qw/vp9_quantize_fp neon sse2/;

+  specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";

   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -11,6 +11,7 @@

 %define private_prefix vp9

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION_RODATA

 pw_1: times 8 dw 1

@@ -48,15 +49,15 @@

 %endif

   pxor                            m5, m5                   ; m5 = dedicated zero

-  lea                         coeffq, [  coeffq+ncoeffq*2]

-  lea                            r5q, [  r5q+ncoeffq*2]

-  lea                            r3q, [ r3q+ncoeffq*2]

-  lea                            r4q, [r4q+ncoeffq*2]

+  INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq

+  lea                            r5q, [r5q+ncoeffq*2]

+  INCREMENT_ELEMENTS_TRAN_LOW    r3q, ncoeffq

+  INCREMENT_ELEMENTS_TRAN_LOW    r4q, ncoeffq

   neg                        ncoeffq

   ; get DC and first 15 AC coeffs

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]

+  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

   pcmpeqw                         m7, m7

@@ -69,8 +70,8 @@

   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

   psignw                          m8, m9                   ; m8 = reinsert sign

   psignw                         m13, m10                  ; m13 = reinsert sign

-  mova            [r3q+ncoeffq*2+ 0], m8

-  mova            [r3q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW  8, r3q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12

 %ifidn %1, fp_32x32

   pabsw                           m8, m8

   pabsw                          m13, m13

@@ -87,8 +88,8 @@

 %else

   psrlw                           m0, m3, 1

 %endif

-  mova            [r4q+ncoeffq*2+ 0], m8

-  mova            [r4q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW  8, r4q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12

   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0

   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

   mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]

@@ -102,8 +103,8 @@

   jz .accumulate_eob

 .ac_only_loop:

-  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

-  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]

+  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

@@ -123,8 +124,8 @@

   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

   psignw                         m14, m9                   ; m14 = reinsert sign

   psignw                         m13, m10                  ; m13 = reinsert sign

-  mova            [r3q+ncoeffq*2+ 0], m14

-  mova            [r3q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW 14, r3q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12

 %ifidn %1, fp_32x32

   pabsw                          m14, m14

   pabsw                          m13, m13

@@ -137,8 +138,8 @@

   psignw                         m14, m9

   psignw                         m13, m10

 %endif

-  mova            [r4q+ncoeffq*2+ 0], m14

-  mova            [r4q+ncoeffq*2+16], m13

+  STORE_TRAN_LOW 14, r4q, ncoeffq,     6, 11, 12

+  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12

   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0

   pcmpeqw                        m13, m5                   ; m13 = c[i] == 0

   mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = scan[i]

@@ -154,10 +155,10 @@

   jmp .accumulate_eob

 .skip_iter:

-  mova            [r3q+ncoeffq*2+ 0], m5

-  mova            [r3q+ncoeffq*2+16], m5

-  mova            [r4q+ncoeffq*2+ 0], m5

-  mova            [r4q+ncoeffq*2+16], m5

+  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq

+  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8

+  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq

+  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8

   add                        ncoeffq, mmsize

   jl .ac_only_loop

@@ -186,10 +187,10 @@

   neg                        ncoeffq

   pxor                            m7, m7

 .blank_loop:

-  mova            [r0q+ncoeffq*2+ 0], m7

-  mova            [r0q+ncoeffq*2+16], m7

-  mova            [r2q+ncoeffq*2+ 0], m7

-  mova            [r2q+ncoeffq*2+16], m7

+  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq

+  STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8

+  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq

+  STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8

   add                        ncoeffq, mmsize

   jl .blank_loop

   mov                     word [r3q], 0

--- a/vpx_dsp/x86/bitdepth_conversion_sse2.asm

+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm

@@ -38,29 +38,53 @@

 ; the values down to 16 bits.

 %macro LOAD_TRAN_LOW 3

 %if CONFIG_VP9_HIGHBITDEPTH

-  mova     m%1, [%2 + %3 * 4]

-  packssdw m%1, [%2 + %3 * 4 + 16]

+  mova     m%1, [%2 + (%3) * 4]                ; first 16 bytes of 32-bit values; (%3) is parenthesized so an offset like "n + 8" is scaled as a whole

+  packssdw m%1, [%2 + (%3) * 4 + 16]           ; pack with the next 16 bytes down to 16-bit words (signed saturation)

 %else

-  mova     m%1, [%2 + %3 * 2]

+  mova     m%1, [%2 + (%3) * 2]                ; values are already 16 bits wide: plain load

 %endif

 %endmacro

 ; Store m%1 to %2 + %3.

 ; %3 is the offset in elements, not bytes.

+; If 5 arguments are provided then m%1 is clobbered (high bit depth path only).

+; If 6 arguments are provided then m%1 is preserved; m%6 is extra scratch.

 ; If tran_low_t is 16 bits (low bit depth configuration) then store the value

 ; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign

 ; extend the values first.

 ; Uses m%4-m%6 as scratch registers for high bit depth.

-%macro STORE_TRAN_LOW 5

+%macro STORE_TRAN_LOW 5-6 ; src reg, base, element offset, scratch, scratch [, scratch]

 %if CONFIG_VP9_HIGHBITDEPTH

   pxor                      m%4, m%4

   mova                      m%5, m%1

+  %if %0 == 6 ; %0 = number of arguments passed to the macro

+  mova                      m%6, m%1 ; second copy so the high half can be widened without touching m%1

+  %endif

   pcmpgtw                   m%4, m%1

   punpcklwd                 m%5, m%4

+  %if %0 == 5 ; no spare register: widen the high half in place, clobbering m%1

   punpckhwd                 m%1, m%4

-  mova       [%2 + %3 * 4 +  0], m%5

-  mova       [%2 + %3 * 4 + 16], m%1

+  %else ; 6 arguments: widen the copy in m%6 instead

+  punpckhwd                 m%6, m%4 ; interleave high words with the per-word compare mask in m%4

+  %endif

+  mova     [%2 + (%3) * 4 +  0], m%5 ; low half; (%3) parenthesized so expression offsets scale as a whole

+  %if %0 == 5 ; high half lives in whichever register was widened above

+  mova     [%2 + (%3) * 4 + 16], m%1

+  %else

+  mova     [%2 + (%3) * 4 + 16], m%6

+  %endif

 %else

-  mova            [%2 + %3 * 2], m%1

+  mova          [%2 + (%3) * 2], m%1 ; low bit depth: store directly, no scratch registers touched

+%endif

+%endmacro

+; Store zeros (in m%1) to %2 + %3. m%1 is stored as-is; the caller must have zeroed it.

+; %3 is the offset in elements, not bytes.

+%macro STORE_ZERO_TRAN_LOW 3 ; zero reg, base, element offset

+%if CONFIG_VP9_HIGHBITDEPTH

+  mova     [%2 + (%3) * 4 +  0], m%1 ; 4 bytes per element: two stores cover what one store covers below

+  mova     [%2 + (%3) * 4 + 16], m%1

+%else

+  mova          [%2 + (%3) * 2], m%1 ; 2 bytes per element: a single store

 %endif

 %endmacro