shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -717,6 +717,9 @@

 add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

 specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";

+add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";

 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -320,10 +320,10 @@

   switch (tx_size) {

     case TX_32X32:

       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,

-                           p->quant, p->quant_shift, qcoeff, dqcoeff,

-                           pd->dequant, p->zbin_extra, eob, scan_order->scan,

-                           scan_order->iscan);

+      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,

+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                            pd->dequant, p->zbin_extra, eob, scan_order->scan,

+                            scan_order->iscan);

       break;

     case TX_16X16:

       vp9_fdct16x16(src_diff, coeff, diff_stride);

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -104,6 +104,49 @@

   *eob_ptr = eob + 1;

+// TODO(jingning) Refactor this file and combine functions with similar

+// operations. This variant walks coeff_ptr in scan order using only round_ptr, quant_ptr and dequant_ptr, and writes qcoeff_ptr, dqcoeff_ptr and *eob_ptr.

+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,

+                             int skip_block,

+                             const int16_t *zbin_ptr, const int16_t *round_ptr,

+                             const int16_t *quant_ptr,

+                             const int16_t *quant_shift_ptr,

+                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                             const int16_t *dequant_ptr,

+                             int zbin_oq_value, uint16_t *eob_ptr,

+                             const int16_t *scan, const int16_t *iscan) {

+  int i, eob = -1;  // eob: scan index of the last nonzero quantized value; -1 means none

+  (void)zbin_ptr;  // zbin_ptr, quant_shift_ptr, zbin_oq_value and iscan are not used by this variant

+  (void)quant_shift_ptr;

+  (void)zbin_oq_value;

+  (void)iscan;

+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));  // outputs start zeroed; positions not written in the loop stay 0

+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));

+  if (!skip_block) {  // with skip_block set, both outputs stay zero and *eob_ptr becomes 0

+    for (i = 0; i < n_coeffs; i++) {

+      const int rc = scan[i];  // rc: coefficient position visited at scan index i

+      const int coeff = coeff_ptr[rc];

+      const int coeff_sign = (coeff >> 31);  // 0 for non-negative, -1 for negative coeff (assumes arithmetic right shift)

+      int tmp = 0;  // quantized magnitude; stays 0 when the threshold test below fails

+      int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;  // branch-free absolute value

+      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {  // [rc != 0]: entry 0 for position 0, entry 1 otherwise; below dequant/4 the output stays 0

+        abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);  // adds the rounding term scaled down by one bit (macro defined elsewhere; presumably (x + 1) >> 1)

+        abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);  // keep the sum within int16 range before the multiply

+        tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;  // fixed-point multiply by the quantizer, keeping bits above bit 15

+        qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;  // reapply the original sign

+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;  // dequantized value is halved in this variant

+      }

+      if (tmp)  // remember the latest scan index that produced a nonzero value

+        eob = i;

+    }

+  }

+  *eob_ptr = eob + 1;  // one past the last nonzero scan index; 0 when nothing was nonzero

+}

 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,

                       int skip_block,

                       const int16_t *zbin_ptr, const int16_t *round_ptr,

--- a/vp9/encoder/vp9_speed_features.c

+++ b/vp9/encoder/vp9_speed_features.c

@@ -250,6 +250,7 @@

   if (speed >= 5) {

+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;

     sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?

         RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;

     sf->max_partition_size = BLOCK_32X32;

@@ -282,7 +283,6 @@

     sf->elevate_newmv_thresh = 2000;

   if (speed >= 7) {

-    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;

     sf->mv.fullpel_search_step_param = 10;

     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;

     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?

--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -234,13 +234,10 @@

   movifnidn                   quantq, quantmp

   mova                            m1, [roundq]             ; m1 = round

   mova                            m2, [quantq]             ; m2 = quant

-%ifidn %1, b_32x32

-; TODO(jingning) to be continued with 32x32 quantization process

+%ifidn %1, fp_32x32

   pcmpeqw                         m5, m5

   psrlw                           m5, 15

-  paddw                           m0, m5

   paddw                           m1, m5

-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2

   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2

 %endif

   mova                            m3, [r2q]                ; m3 = dequant

@@ -247,8 +244,8 @@

   mov                             r3, qcoeffmp

   mov                             r4, dqcoeffmp

   mov                             r5, iscanmp

-%ifidn %1, b_32x32

-  psllw                           m4, 1

+%ifidn %1, fp_32x32

+  psllw                           m2, 1

 %endif

   pxor                            m5, m5                   ; m5 = dedicated zero

   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob

@@ -275,7 +272,7 @@

   psignw                         m13, m10                  ; m13 = reinsert sign

   mova        [qcoeffq+ncoeffq*2+ 0], m8

   mova        [qcoeffq+ncoeffq*2+16], m13

-%ifidn %1, b_32x32

+%ifidn %1, fp_32x32

   pabsw                           m8, m8

   pabsw                          m13, m13

 %endif

@@ -282,11 +279,12 @@

   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q

   punpckhqdq                      m3, m3

   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

-%ifidn %1, b_32x32

+%ifidn %1, fp_32x32

   psrlw                           m8, 1

   psrlw                          m13, 1

   psignw                          m8, m9

   psignw                         m13, m10

+  psrlw                           m0, m3, 2

 %endif

   mova       [dqcoeffq+ncoeffq*2+ 0], m8

   mova       [dqcoeffq+ncoeffq*2+16], m13

@@ -307,13 +305,17 @@

   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

   pabsw                           m6, m9                   ; m6 = abs(m9)

   pabsw                          m11, m10                  ; m11 = abs(m10)

-  pcmpeqw                         m7, m7

-%ifidn %1, b_32x32

+%ifidn %1, fp_32x32

+  pcmpgtw                         m7, m6,  m0

+  pcmpgtw                        m12, m11, m0

   pmovmskb                        r6, m7

-  pmovmskb                        r2, m7

+  pmovmskb                        r2, m12

   or                              r6, r2

   jz .skip_iter

 %endif

+  pcmpeqw                         m7, m7

   paddsw                          m6, m1                   ; m6 += round

   paddsw                         m11, m1                   ; m11 += round

   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16

@@ -322,13 +324,13 @@

   psignw                         m13, m10                  ; m13 = reinsert sign

   mova        [qcoeffq+ncoeffq*2+ 0], m14

   mova        [qcoeffq+ncoeffq*2+16], m13

-%ifidn %1, b_32x32

+%ifidn %1, fp_32x32

   pabsw                          m14, m14

   pabsw                          m13, m13

 %endif

   pmullw                         m14, m3                   ; dqc[i] = qc[i] * q

   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

-%ifidn %1, b_32x32

+%ifidn %1, fp_32x32

   psrlw                          m14, 1

   psrlw                          m13, 1

   psignw                         m14, m9

@@ -349,7 +351,7 @@

   add                        ncoeffq, mmsize

   jl .ac_only_loop

-%ifidn %1, b_32x32

+%ifidn %1, fp_32x32

   jmp .accumulate_eob

 .skip_iter:

   mova        [qcoeffq+ncoeffq*2+ 0], m5

@@ -397,3 +399,4 @@

 INIT_XMM ssse3

 QUANTIZE_FP fp, 7

+QUANTIZE_FP fp_32x32, 7