shithub: libvpx

--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

  */

+#include <assert.h>

 #include <emmintrin.h>

 #include "vpx_dsp/vpx_dsp_common.h"

@@ -37,54 +38,54 @@

   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

   (void)scan;

+  (void)skip_block;

+  assert(!skip_block);

   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));

   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = ((int)count / 4) - 1; i >= 0; i--) {

-      __m128i coeffs, cmp1, cmp2;

-      int test;

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

-      cmp1 = _mm_and_si128(cmp1, cmp2);

-      test = _mm_movemask_epi8(cmp1);

-      if (test == 0xffff)

-        non_zero_regs--;

-      else

-        break;

-    }

+  // Pre-scan pass

+  for (i = ((int)count / 4) - 1; i >= 0; i--) {

+    __m128i coeffs, cmp1, cmp2;

+    int test;

+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

+    cmp1 = _mm_and_si128(cmp1, cmp2);

+    test = _mm_movemask_epi8(cmp1);

+    if (test == 0xffff)

+      non_zero_regs--;

+    else

+      break;

+  }

-    // Quantization pass:

-    for (i = 0; i < non_zero_regs; i++) {

-      __m128i coeffs, coeffs_sign, tmp1, tmp2;

-      int test;

-      int abs_coeff[4];

-      int coeff_sign[4];

+  // Quantization pass:

+  for (i = 0; i < non_zero_regs; i++) {

+    __m128i coeffs, coeffs_sign, tmp1, tmp2;

+    int test;

+    int abs_coeff[4];

+    int coeff_sign[4];

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      coeffs_sign = _mm_srai_epi32(coeffs, 31);

-      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

-      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

-      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

-      tmp1 = _mm_or_si128(tmp1, tmp2);

-      test = _mm_movemask_epi8(tmp1);

-      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);

-      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+    coeffs_sign = _mm_srai_epi32(coeffs, 31);

+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);

+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);

+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);

+    tmp1 = _mm_or_si128(tmp1, tmp2);

+    test = _mm_movemask_epi8(tmp1);

+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);

+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

-      for (j = 0; j < 4; j++) {

-        if (test & (1 << (4 * j))) {

-          int k = 4 * i + j;

-          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];

-          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;

-          const uint32_t abs_qcoeff =

-              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);

-          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];

-          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

-          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;

-        }

+    for (j = 0; j < 4; j++) {

+      if (test & (1 << (4 * j))) {

+        int k = 4 * i + j;

+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];

+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;

+        const uint32_t abs_qcoeff =

+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);

+        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];

+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];

+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;

@@ -105,6 +106,9 @@

   const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);

   const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);

   (void)scan;

+  (void)skip_block;

+  assert(!skip_block);

   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);

   zbins[1] = _mm_set1_epi32(zbin1_tmp);

@@ -116,38 +120,35 @@

   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));

   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

-  if (!skip_block) {

-    // Pre-scan pass

-    for (i = 0; i < n_coeffs / 4; i++) {

-      __m128i coeffs, cmp1, cmp2;

-      int test;

-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

-      cmp1 = _mm_and_si128(cmp1, cmp2);

-      test = _mm_movemask_epi8(cmp1);

-      if (!(test & 0xf)) idx_arr[idx++] = i * 4;

-      if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;

-      if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;

-      if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;

-    }

+  // Pre-scan pass

+  for (i = 0; i < n_coeffs / 4; i++) {

+    __m128i coeffs, cmp1, cmp2;

+    int test;

+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));

+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);

+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);

+    cmp1 = _mm_and_si128(cmp1, cmp2);

+    test = _mm_movemask_epi8(cmp1);

+    if (!(test & 0xf)) idx_arr[idx++] = i * 4;

+    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;

+    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;

+    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;

+  }

-    // Quantization pass: only process the coefficients selected in

-    // pre-scan pass. Note: idx can be zero.

-    for (i = 0; i < idx; i++) {

-      const int rc = idx_arr[i];

-      const int coeff = coeff_ptr[rc];

-      const int coeff_sign = (coeff >> 31);

-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

-      const int64_t tmp1 =

-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

-      const uint32_t abs_qcoeff =

-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

-      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;

-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

-      if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;

-    }

+  // Quantization pass: only process the coefficients selected in

+  // pre-scan pass. Note: idx can be zero.

+  for (i = 0; i < idx; i++) {

+    const int rc = idx_arr[i];

+    const int coeff = coeff_ptr[rc];

+    const int coeff_sign = (coeff >> 31);

+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

+    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);

+    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;

+    const uint32_t abs_qcoeff =

+        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);

+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;

+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;

+    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;

   *eob_ptr = eob + 1;

--- a/vpx_dsp/x86/quantize_avx_x86_64.asm

+++ b/vpx_dsp/x86/quantize_avx_x86_64.asm

@@ -19,10 +19,6 @@

   vzeroupper

-  ; If we can skip this block, then just zero the output

-  cmp                         skipmp, 0

-  jne .blank

 %ifnidn %1, b_32x32

   ; Special case for ncoeff == 16, as it is frequent and we can save on

@@ -491,48 +487,6 @@

   pmaxsw                          m8, m7

   movq                           rax, m8

   mov                           [r2], ax

-  vzeroupper

-  RET

-  ; Skip-block, i.e. just write all zeroes

-.blank:

-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \

-            qcoeff, dqcoeff, dequant, eob, scan, iscan

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

-%if CONFIG_VP9_HIGHBITDEPTH

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]

-%else

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-%endif

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7

-  mova       [dqcoeffq+ncoeffq*4+32], ymm7

-  mova        [qcoeffq+ncoeffq*4+ 0], ymm7

-  mova        [qcoeffq+ncoeffq*4+32], ymm7

-%else

-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7

-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7

-%endif

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                         [eobq], word 0

   vzeroupper

   RET

 %endmacro

--- a/vpx_dsp/x86/quantize_sse2.c

+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

  */

+#include <assert.h>

 #include <emmintrin.h>

 #include <xmmintrin.h>

@@ -23,7 +24,12 @@

                          uint16_t *eob_ptr, const int16_t *scan_ptr,

                          const int16_t *iscan_ptr) {

   __m128i zero;

+  __m128i eob;

+  __m128i zbin;

+  __m128i round, quant, dequant, shift;

   (void)scan_ptr;

+  (void)skip_block;

+  assert(!skip_block);

   coeff_ptr += n_coeffs;

   iscan_ptr += n_coeffs;

@@ -31,193 +37,179 @@

   dqcoeff_ptr += n_coeffs;

   n_coeffs = -n_coeffs;

   zero = _mm_setzero_si128();

-  if (!skip_block) {

-    __m128i eob;

-    __m128i zbin;

-    __m128i round, quant, dequant, shift;

+  {

+    __m128i coeff0, coeff1;

+    // Setup global values

-      __m128i coeff0, coeff1;

+      __m128i pw_1;

+      zbin = _mm_load_si128((const __m128i *)zbin_ptr);

+      round = _mm_load_si128((const __m128i *)round_ptr);

+      quant = _mm_load_si128((const __m128i *)quant_ptr);

+      pw_1 = _mm_set1_epi16(1);

+      zbin = _mm_sub_epi16(zbin, pw_1);

+      dequant = _mm_load_si128((const __m128i *)dequant_ptr);

+      shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

+    }

-      // Setup global values

-      {

-        __m128i pw_1;

-        zbin = _mm_load_si128((const __m128i *)zbin_ptr);

-        round = _mm_load_si128((const __m128i *)round_ptr);

-        quant = _mm_load_si128((const __m128i *)quant_ptr);

-        pw_1 = _mm_set1_epi16(1);

-        zbin = _mm_sub_epi16(zbin, pw_1);

-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);

-        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

-      }

+    {

+      __m128i coeff0_sign, coeff1_sign;

+      __m128i qcoeff0, qcoeff1;

+      __m128i qtmp0, qtmp1;

+      __m128i cmp_mask0, cmp_mask1;

+      // Do DC and first 15 AC

+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        __m128i cmp_mask0, cmp_mask1;

-        // Do DC and first 15 AC

-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

+      // Poor man's sign extract

+      coeff0_sign = _mm_srai_epi16(coeff0, 15);

+      coeff1_sign = _mm_srai_epi16(coeff1, 15);

+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

+      zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC

+      cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+      round = _mm_unpackhi_epi64(round, round);

+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+      quant = _mm_unpackhi_epi64(quant, quant);

+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+      qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

+      qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

+      qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

+      shift = _mm_unpackhi_epi64(shift, shift);

+      qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC

-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        round = _mm_unpackhi_epi64(round, round);

-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        quant = _mm_unpackhi_epi64(quant, quant);

-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

-        shift = _mm_unpackhi_epi64(shift, shift);

-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

+      // Reinsert signs

+      qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Reinsert signs

-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      // Mask out zbin threshold coeffs

+      qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

+      qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

-        // Mask out zbin threshold coeffs

-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+      dequant = _mm_unpackhi_epi64(dequant, dequant);

+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        dequant = _mm_unpackhi_epi64(dequant, dequant);

-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

+    }

-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-      }

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob = _mm_max_epi16(eob, eob1);

-      }

-      n_coeffs += 8 * 2;

+    {

+      // Scan for eob

+      __m128i zero_coeff0, zero_coeff1;

+      __m128i nzero_coeff0, nzero_coeff1;

+      __m128i iscan0, iscan1;

+      __m128i eob1;

+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

+      // Add one to convert from indices to counts

+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+      eob = _mm_and_si128(iscan0, nzero_coeff0);

+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+      eob = _mm_max_epi16(eob, eob1);

+    n_coeffs += 8 * 2;

+  }

-    // AC only loop

-    while (n_coeffs < 0) {

-      __m128i coeff0, coeff1;

-      {

-        __m128i coeff0_sign, coeff1_sign;

-        __m128i qcoeff0, qcoeff1;

-        __m128i qtmp0, qtmp1;

-        __m128i cmp_mask0, cmp_mask1;

+  // AC only loop

+  while (n_coeffs < 0) {

+    __m128i coeff0, coeff1;

+    {

+      __m128i coeff0_sign, coeff1_sign;

+      __m128i qcoeff0, qcoeff1;

+      __m128i qtmp0, qtmp1;

+      __m128i cmp_mask0, cmp_mask1;

-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);

-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);

+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

-        // Poor man's sign extract

-        coeff0_sign = _mm_srai_epi16(coeff0, 15);

-        coeff1_sign = _mm_srai_epi16(coeff1, 15);

-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      // Poor man's sign extract

+      coeff0_sign = _mm_srai_epi16(coeff0, 15);

+      coeff1_sign = _mm_srai_epi16(coeff1, 15);

+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);

-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);

-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

+      cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);

+      cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);

+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);

+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);

+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

+      qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);

+      qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);

+      qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);

+      qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);

-        // Reinsert signs

-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

+      // Reinsert signs

+      qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);

+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);

+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

-        // Mask out zbin threshold coeffs

-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

+      // Mask out zbin threshold coeffs

+      qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);

+      qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);

+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);

+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-      }

-      {

-        // Scan for eob

-        __m128i zero_coeff0, zero_coeff1;

-        __m128i nzero_coeff0, nzero_coeff1;

-        __m128i iscan0, iscan1;

-        __m128i eob0, eob1;

-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

-        // Add one to convert from indices to counts

-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);

-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);

-        eob0 = _mm_max_epi16(eob0, eob1);

-        eob = _mm_max_epi16(eob, eob0);

-      }

-      n_coeffs += 8 * 2;

+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);

+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);

-    // Accumulate EOB

-      __m128i eob_shuffled;

-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

-      eob = _mm_max_epi16(eob, eob_shuffled);

-      *eob_ptr = _mm_extract_epi16(eob, 1);

+      // Scan for eob

+      __m128i zero_coeff0, zero_coeff1;

+      __m128i nzero_coeff0, nzero_coeff1;

+      __m128i iscan0, iscan1;

+      __m128i eob0, eob1;

+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);

+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);

+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);

+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);

+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));

+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);

+      // Add one to convert from indices to counts

+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);

+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);

+      eob0 = _mm_and_si128(iscan0, nzero_coeff0);

+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);

+      eob0 = _mm_max_epi16(eob0, eob1);

+      eob = _mm_max_epi16(eob, eob0);

-  } else {

-    do {

-      store_tran_low(zero, dqcoeff_ptr + n_coeffs);

-      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);

-      store_tran_low(zero, qcoeff_ptr + n_coeffs);

-      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);

-      n_coeffs += 8 * 2;

-    } while (n_coeffs < 0);

-    *eob_ptr = 0;

+    n_coeffs += 8 * 2;

+  }

+  // Accumulate EOB

+  {

+    __m128i eob_shuffled;

+    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);

+    eob = _mm_max_epi16(eob, eob_shuffled);

+    *eob_ptr = _mm_extract_epi16(eob, 1);

--- a/vpx_dsp/x86/quantize_ssse3.c

+++ b/vpx_dsp/x86/quantize_ssse3.c

@@ -8,6 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

  */

+#include <assert.h>

 #include <tmmintrin.h>

 #include "./vpx_dsp_rtcd.h"

@@ -28,18 +29,8 @@

   __m128i round, quant, dequant, shift;

   intptr_t index = 0;

   (void)scan_ptr;

-  if (skip_block) {

-    do {

-      store_tran_low(zero, dqcoeff_ptr + index);

-      store_tran_low(zero, dqcoeff_ptr + index + 8);

-      store_tran_low(zero, qcoeff_ptr + index);

-      store_tran_low(zero, qcoeff_ptr + index + 8);

-      index += 16;

-    } while (index < n_coeffs);

-    *eob_ptr = 0;

-    return;

-  }

+  (void)skip_block;

+  assert(!skip_block);

   // Setup global values

--- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm

+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

@@ -19,9 +19,6 @@

 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \

                                 shift, qcoeff, dqcoeff, dequant, \

                                 eob, scan, iscan

-  cmp                    dword skipm, 0

-  jne .blank

   ; actual quantize loop - setup pointers, rounders, etc.

   movifnidn                   coeffq, coeffmp

   movifnidn                  ncoeffq, ncoeffmp

@@ -299,46 +296,6 @@

   pmaxsw                          m8, m7

   pextrw                          r6, m8, 0

   mov                             [r2], r6

-  RET

-  ; skip-block, i.e. just write all zeroes

-.blank:

-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \

-            qcoeff, dqcoeff, dequant, eob, scan, iscan

-  mov                             r0, dqcoeffmp

-  movifnidn                  ncoeffq, ncoeffmp

-  mov                             r2, qcoeffmp

-  mov                             r3, eobmp

-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

-%if CONFIG_VP9_HIGHBITDEPTH

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]

-%else

-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]

-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]

-%endif

-  neg                        ncoeffq

-  pxor                            m7, m7

-.blank_loop:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova       [dqcoeffq+ncoeffq*4+ 0], m7

-  mova       [dqcoeffq+ncoeffq*4+16], m7

-  mova       [dqcoeffq+ncoeffq*4+32], m7

-  mova       [dqcoeffq+ncoeffq*4+48], m7

-  mova        [qcoeffq+ncoeffq*4+ 0], m7

-  mova        [qcoeffq+ncoeffq*4+16], m7

-  mova        [qcoeffq+ncoeffq*4+32], m7

-  mova        [qcoeffq+ncoeffq*4+48], m7

-%else

-  mova       [dqcoeffq+ncoeffq*2+ 0], m7

-  mova       [dqcoeffq+ncoeffq*2+16], m7

-  mova        [qcoeffq+ncoeffq*2+ 0], m7

-  mova        [qcoeffq+ncoeffq*2+16], m7

-%endif

-  add                        ncoeffq, mmsize

-  jl .blank_loop

-  mov                    word [eobq], 0

   RET

 %endmacro