shithub: libvpx

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -714,6 +714,9 @@

 add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";

 specialize qw/vp9_subtract_block/, "$sse2_x86inc";

+add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

+specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";

 add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

 specialize qw/vp9_quantize_b/, "$ssse3_x86_64";

--- a/vp9/encoder/vp9_block.h

+++ b/vp9/encoder/vp9_block.h

@@ -35,6 +35,7 @@

   // Quantizer setings

   int16_t *quant_fp;

+  int16_t *round_fp;

   int16_t *quant;

   int16_t *quant_shift;

   int16_t *zbin;

@@ -109,6 +110,9 @@

   // indicate if it is in the rd search loop or encoding process

   int use_lp32x32fdct;

   int skip_encode;

+  // use fast quantization process

+  int quant_fp;

   // skip forward transform and quantization

   int skip_txfm;

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -3074,6 +3074,7 @@

   init_encode_frame_mb_context(cpi);

   set_prev_mi(cm);

+  x->quant_fp = cpi->sf.use_quant_fp;

   x->skip_txfm = 0;

   if (sf->use_nonrd_pick_mode) {

     // Initialize internal buffer pointers for rtc coding, where non-RD

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -306,6 +306,7 @@

   MACROBLOCKD *const xd = &x->e_mbd;

   const struct macroblock_plane *const p = &x->plane[plane];

   const struct macroblockd_plane *const pd = &xd->plane[plane];

+  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];

   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);

   int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

@@ -313,7 +314,56 @@

   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

   int i, j;

   const int16_t *src_diff;

+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);

+  src_diff = &p->src_diff[4 * (j * diff_stride + i)];

+  switch (tx_size) {

+    case TX_32X32:

+      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);

+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,

+                           p->quant, p->quant_shift, qcoeff, dqcoeff,

+                           pd->dequant, p->zbin_extra, eob, scan_order->scan,

+                           scan_order->iscan);

+      break;

+    case TX_16X16:

+      vp9_fdct16x16(src_diff, coeff, diff_stride);

+      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,

+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                      pd->dequant, p->zbin_extra, eob,

+                      scan_order->scan, scan_order->iscan);

+      break;

+    case TX_8X8:

+      vp9_fdct8x8(src_diff, coeff, diff_stride);

+      vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,

+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                      pd->dequant, p->zbin_extra, eob,

+                      scan_order->scan, scan_order->iscan);

+      break;

+    case TX_4X4:

+      x->fwd_txm4x4(src_diff, coeff, diff_stride);

+      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,

+                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,

+                      pd->dequant, p->zbin_extra, eob,

+                      scan_order->scan, scan_order->iscan);

+      break;

+    default:

+      assert(0);

+  }

+}

+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,

+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  const struct macroblock_plane *const p = &x->plane[plane];

+  const struct macroblockd_plane *const pd = &xd->plane[plane];

+  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);

+  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);

+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  uint16_t *const eob = &p->eobs[block];

+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

+  int i, j;

+  const int16_t *src_diff;

   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);

   src_diff = &p->src_diff[4 * (j * diff_stride + i)];

@@ -424,11 +474,15 @@

   if (x->skip_txfm == 0) {

     // full forward transform and quantization

-    if (!x->skip_recode)

-      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

+    if (!x->skip_recode) {

+      if (x->quant_fp)

+        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);

+      else

+        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

+    }

   } else if (x->skip_txfm == 2) {

     // fast path forward transform and quantization

-    vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);

+    vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);

   } else {

     // skip forward transform

     p->eobs[block] = 0;

--- a/vp9/encoder/vp9_encodemb.h

+++ b/vp9/encoder/vp9_encodemb.h

@@ -24,6 +24,8 @@

 void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);

 void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,

                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);

+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,

+                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);

 void vp9_xform_quant(MACROBLOCK *x, int plane, int block,

                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -42,9 +42,9 @@

 void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,

-                     const int16_t *round_ptr, const int16_t quant,

-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {

+                           const int16_t *round_ptr, const int16_t quant,

+                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                           const int16_t dequant_ptr, uint16_t *eob_ptr) {

   int eob = -1;

   if (!skip_block) {

@@ -63,6 +63,47 @@

   *eob_ptr = eob + 1;

+// Fast-path quantizer (C reference). Coefficients are visited in scan order;

+// for raster position rc the result is

+//   q  = sign(c) * ((clamp(|c| + round, INT16_MIN, INT16_MAX) * quant) >> 16)

+//   dq = q * dequant

+// where round/quant/dequant use table entry 0 when rc == 0 and entry 1 for

+// every other position. qcoeff_ptr and dqcoeff_ptr are zeroed up front, so

+// with skip_block set both stay all-zero and *eob_ptr is written as 0.

+// Otherwise *eob_ptr is one past the last scan index whose quantized

+// magnitude is nonzero.

+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,

+                       int skip_block,

+                       const int16_t *zbin_ptr, const int16_t *round_ptr,

+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,

+                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

+                       const int16_t *dequant_ptr,

+                       int zbin_oq_value, uint16_t *eob_ptr,

+                       const int16_t *scan, const int16_t *iscan) {

+  int i, eob = -1;

+  // TODO(jingning) Decide the need of these arguments after the

+  // quantization process is completed.

+  // Until then they are accepted only so the signature matches the

+  // vp9_quantize_fp prototype; none of them is read here.

+  (void)zbin_ptr;

+  (void)quant_shift_ptr;

+  (void)zbin_oq_value;

+  (void)iscan;

+  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));

+  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));

+  if (!skip_block) {

+    // Quantization pass: every coefficient is quantized. The zero-bin

+    // table is not consulted, so nothing is skipped ahead of the loop.

+    for (i = 0; i < count; i++) {

+      const int rc = scan[i];

+      const int coeff = coeff_ptr[rc];

+      // coeff_sign is 0 for non-negative and -1 for negative values, so

+      // (v ^ sign) - sign yields |v| here and restores the sign below.

+      const int coeff_sign = (coeff >> 31);

+      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

+      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);

+      tmp = (tmp * quant_ptr[rc != 0]) >> 16;

+      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;

+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];

+      // Track the last scan index that produced a nonzero level.

+      if (tmp)

+        eob = i;

+    }

+  }

+  *eob_ptr = eob + 1;

+}

 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,

                       int skip_block,

                       const int16_t *zbin_ptr, const int16_t *round_ptr,

@@ -207,11 +248,16 @@

     const int qrounding_factor = q == 0 ? 64 : 48;

     for (i = 0; i < 2; ++i) {

+      int qrounding_factor_fp = i == 0 ? 48 : 42;

+      if (q == 0)

+        qrounding_factor_fp = 64;

       // y

       quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)

                      : vp9_ac_quant(q, 0);

       invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);

       quants->y_quant_fp[q][i] = (1 << 16) / quant;

+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;

       quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);

       quants->y_round[q][i] = (qrounding_factor * quant) >> 7;

       cm->y_dequant[q][i] = quant;

@@ -222,6 +268,7 @@

       invert_quant(&quants->uv_quant[q][i],

                    &quants->uv_quant_shift[q][i], quant);

       quants->uv_quant_fp[q][i] = (1 << 16) / quant;

+      quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;

       quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);

       quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;

       cm->uv_dequant[q][i] = quant;

@@ -240,6 +287,7 @@

     for (i = 2; i < 8; i++) {

       quants->y_quant[q][i] = quants->y_quant[q][1];

       quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];

+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];

       quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];

       quants->y_zbin[q][i] = quants->y_zbin[q][1];

       quants->y_round[q][i] = quants->y_round[q][1];

@@ -247,6 +295,7 @@

       quants->uv_quant[q][i] = quants->uv_quant[q][1];

       quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];

+      quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];

       quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];

       quants->uv_zbin[q][i] = quants->uv_zbin[q][1];

       quants->uv_round[q][i] = quants->uv_round[q][1];

@@ -276,6 +325,7 @@

   // Y

   x->plane[0].quant = quants->y_quant[qindex];

   x->plane[0].quant_fp = quants->y_quant_fp[qindex];

+  x->plane[0].round_fp = quants->y_round_fp[qindex];

   x->plane[0].quant_shift = quants->y_quant_shift[qindex];

   x->plane[0].zbin = quants->y_zbin[qindex];

   x->plane[0].round = quants->y_round[qindex];

@@ -286,6 +336,7 @@

   for (i = 1; i < 3; i++) {

     x->plane[i].quant = quants->uv_quant[qindex];

     x->plane[i].quant_fp = quants->uv_quant_fp[qindex];

+    x->plane[i].round_fp = quants->uv_round_fp[qindex];

     x->plane[i].quant_shift = quants->uv_quant_shift[qindex];

     x->plane[i].zbin = quants->uv_zbin[qindex];

     x->plane[i].round = quants->uv_round[qindex];

--- a/vp9/encoder/vp9_quantize.h

+++ b/vp9/encoder/vp9_quantize.h

@@ -28,6 +28,8 @@

   // if we want to deprecate the current use of y_quant.

   DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);

   DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);

+  DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);

+  DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);

   DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);

   DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);

--- a/vp9/encoder/vp9_speed_features.c

+++ b/vp9/encoder/vp9_speed_features.c

@@ -279,6 +279,7 @@

     sf->reuse_inter_pred_sby = 1;

   if (speed >= 7) {

+    sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;

     sf->lpf_pick = LPF_PICK_MINIMAL_LPF;

     sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?

         800 : 300;

@@ -314,6 +315,7 @@

   sf->use_lp32x32fdct = 0;

   sf->adaptive_motion_search = 0;

   sf->adaptive_pred_interp_filter = 0;

+  sf->use_quant_fp = 0;

   sf->reference_masking = 0;

   sf->partition_search_type = SEARCH_PARTITION;

   sf->less_rectangular_check = 0;

--- a/vp9/encoder/vp9_speed_features.h

+++ b/vp9/encoder/vp9_speed_features.h

@@ -284,6 +284,9 @@

   // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.

   int adaptive_pred_interp_filter;

+  // Fast quantization process path

+  int use_quant_fp;

   // Search through variable block partition types in non-RD mode decision

   // encoding process for RTC.

   int partition_check;

--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm

@@ -217,3 +217,185 @@

 INIT_XMM ssse3

 QUANTIZE_FN b, 7

 QUANTIZE_FN b_32x32, 7

+; QUANTIZE_FP <suffix>, <number of general-purpose registers>

+;

+; Emits quantize_<suffix>, taking the 14 arguments named on the cglobal line

+; below. Per 16-bit lane it computes

+;   qc  = sign(c) * (((|c| + round) * quant) >> 16)   (saturating add)

+;   dqc = qc * dequant

+; The first 8 coefficients use round/quant/dequant elements 0..7 as loaded;

+; afterwards the high quadword is duplicated so every lane uses elements 4..7.

+; eob is the maximum of iscan[i] + 1 over lanes whose dqc is nonzero.

+; When skip is nonzero, qcoeff and dqcoeff are zero-filled and eob is set to 0.

+; Each iteration handles 16 coefficients with aligned loads/stores, so this

+; assumes ncoeff is a positive multiple of 16 and that all buffers are 16-byte

+; aligned - TODO confirm against callers.

+%macro QUANTIZE_FP 2

+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \

+                                eob, scan, iscan

+  cmp                    dword skipm, 0

+  jne .blank

+  ; actual quantize loop - setup pointers, rounders, etc.

+  movifnidn                   coeffq, coeffmp

+  movifnidn                  ncoeffq, ncoeffmp

+  mov                             r2, dequantmp

+  movifnidn                    zbinq, zbinmp

+  movifnidn                   roundq, roundmp

+  movifnidn                   quantq, quantmp

+  mova                            m1, [roundq]             ; m1 = round

+  mova                            m2, [quantq]             ; m2 = quant

+%ifidn %1, b_32x32

+; TODO(jingning) to be continued with 32x32 quantization process

+; NOTE(review): not assembled for the only instantiation (fp). m0 is never

+; loaded in this macro, so this branch is not usable as written.

+  pcmpeqw                         m5, m5

+  psrlw                           m5, 15

+  paddw                           m0, m5

+  paddw                           m1, m5

+  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2

+  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2

+%endif

+  mova                            m3, [r2q]                ; m3 = dequant

+  mov                             r3, qcoeffmp

+  mov                             r4, dqcoeffmp

+  mov                             r5, iscanmp

+%ifidn %1, b_32x32

+; NOTE(review): m4 is never loaded in this macro either.

+  psllw                           m4, 1

+%endif

+  pxor                            m5, m5                   ; m5 = dedicated zero

+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob

+  ; point every buffer at its end and count ncoeff up from -n towards zero

+  lea                         coeffq, [  coeffq+ncoeffq*2]

+  lea                         iscanq, [  iscanq+ncoeffq*2]

+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

+  neg                        ncoeffq

+  ; get DC and first 15 AC coeffs

+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  pabsw                           m6, m9                   ; m6 = abs(m9)

+  pabsw                          m11, m10                  ; m11 = abs(m10)

+  pcmpeqw                         m7, m7                   ; m7 = all ones (-1)

+  pcmpeqw                        m12, m12                  ; m12 = all ones (-1)

+  paddsw                          m6, m1                   ; m6 += round

+  punpckhqdq                      m1, m1                   ; m1 = AC round in all lanes

+  paddsw                         m11, m1                   ; m11 += round

+  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16

+  punpckhqdq                      m2, m2                   ; m2 = AC quant in all lanes

+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

+  psignw                          m8, m9                   ; m8 = reinsert sign

+  psignw                         m13, m10                  ; m13 = reinsert sign

+  mova        [qcoeffq+ncoeffq*2+ 0], m8

+  mova        [qcoeffq+ncoeffq*2+16], m13

+%ifidn %1, b_32x32

+  pabsw                           m8, m8

+  pabsw                          m13, m13

+%endif

+  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q

+  punpckhqdq                      m3, m3                   ; m3 = AC dequant in all lanes

+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

+%ifidn %1, b_32x32

+  psrlw                           m8, 1

+  psrlw                          m13, 1

+  psignw                          m8, m9

+  psignw                         m13, m10

+%endif

+  mova       [dqcoeffq+ncoeffq*2+ 0], m8

+  mova       [dqcoeffq+ncoeffq*2+16], m13

+  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0

+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0

+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]

+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]

+  psubw                           m6, m7                   ; m6 = iscan[i] + 1

+  psubw                          m11, m12                  ; m11 = iscan[i] + 1

+  pandn                           m8, m6                   ; m8 = max(eob)

+  pandn                          m13, m11                  ; m13 = max(eob)

+  pmaxsw                          m8, m13

+  add                        ncoeffq, mmsize

+  jz .accumulate_eob

+.ac_only_loop:

+  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]

+  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]

+  pabsw                           m6, m9                   ; m6 = abs(m9)

+  pabsw                          m11, m10                  ; m11 = abs(m10)

+  pcmpeqw                         m7, m7                   ; m7 = all ones (-1)

+  pcmpeqw                        m12, m12                  ; m12 = all ones (-1)

+%ifidn %1, b_32x32

+; NOTE(review): m7/m12 were just set to all ones, so this mask is never zero

+; and .skip_iter cannot be reached as written.

+  pmovmskb                        r6, m7

+  pmovmskb                        r2, m12

+  or                              r6, r2

+  jz .skip_iter

+%endif

+  paddsw                          m6, m1                   ; m6 += round

+  paddsw                         m11, m1                   ; m11 += round

+  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16

+  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16

+  psignw                         m14, m9                   ; m14 = reinsert sign

+  psignw                         m13, m10                  ; m13 = reinsert sign

+  mova        [qcoeffq+ncoeffq*2+ 0], m14

+  mova        [qcoeffq+ncoeffq*2+16], m13

+%ifidn %1, b_32x32

+  pabsw                          m14, m14

+  pabsw                          m13, m13

+%endif

+  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q

+  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q

+%ifidn %1, b_32x32

+  psrlw                          m14, 1

+  psrlw                          m13, 1

+  psignw                         m14, m9

+  psignw                         m13, m10

+%endif

+  mova       [dqcoeffq+ncoeffq*2+ 0], m14

+  mova       [dqcoeffq+ncoeffq*2+16], m13

+  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0

+  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0

+  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]

+  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]

+  psubw                           m6, m7                   ; m6 = iscan[i] + 1

+  psubw                          m11, m12                  ; m11 = iscan[i] + 1

+  pandn                          m14, m6                   ; m14 = max(eob)

+  pandn                          m13, m11                  ; m13 = max(eob)

+  pmaxsw                          m8, m14

+  pmaxsw                          m8, m13

+  add                        ncoeffq, mmsize

+  jl .ac_only_loop

+%ifidn %1, b_32x32

+  jmp .accumulate_eob

+.skip_iter:

+  mova        [qcoeffq+ncoeffq*2+ 0], m5

+  mova        [qcoeffq+ncoeffq*2+16], m5

+  mova       [dqcoeffq+ncoeffq*2+ 0], m5

+  mova       [dqcoeffq+ncoeffq*2+16], m5

+  add                        ncoeffq, mmsize

+  jl .ac_only_loop

+%endif

+.accumulate_eob:

+  ; horizontally accumulate/max eobs and write into [eob] memory pointer

+  mov                             r2, eobmp

+  pshufd                          m7, m8, 0xe

+  pmaxsw                          m8, m7

+  pshuflw                         m7, m8, 0xe

+  pmaxsw                          m8, m7

+  pshuflw                         m7, m8, 0x1

+  pmaxsw                          m8, m7

+  pextrw                          r6, m8, 0

+  ; eob points at a single uint16_t: store only the low word. A full-width

+  ; store of r6 would also overwrite the bytes that follow it.

+  mov                             [r2], r6w

+  RET

+  ; skip-block, i.e. just write all zeroes

+.blank:

+  mov                             r0, dqcoeffmp

+  movifnidn                  ncoeffq, ncoeffmp

+  mov                             r2, qcoeffmp

+  mov                             r3, eobmp

+  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

+  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

+  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

+  neg                        ncoeffq

+  pxor                            m7, m7

+.blank_loop:

+  mova       [dqcoeffq+ncoeffq*2+ 0], m7

+  mova       [dqcoeffq+ncoeffq*2+16], m7

+  mova        [qcoeffq+ncoeffq*2+ 0], m7

+  mova        [qcoeffq+ncoeffq*2+16], m7

+  add                        ncoeffq, mmsize

+  jl .blank_loop

+  mov                    word [eobq], 0

+  RET

+%endmacro

+; Instantiate the fast-path quantizer: expands QUANTIZE_FP with suffix "fp"

+; (cglobal quantize_fp) and 7 as the second macro argument, assembled with

+; the XMM/SSSE3 register setup selected by INIT_XMM.

+INIT_XMM ssse3

+QUANTIZE_FP fp, 7