shithub: libvpx

Download patch

ref: cf4447339e1a3c615b97ebfe04ba80e54f568853
parent: a4364e5146a0612e0ac604cb6a8d636ae6b325fc
parent: eed1badeddcbc79e6c232194cce513774cb25a30
author: Jingning Han <[email protected]>
date: Wed Apr 1 10:55:18 EDT 2015

Merge "Optimize quantization simd implementation"

--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -293,7 +293,8 @@
 
   if (!skip_block) {
     __m128i eob;
-    __m128i round, quant, dequant;
+    __m128i round, quant, dequant, thr;
+    int16_t nzflag;
     {
       __m128i coeff0, coeff1;
 
@@ -368,6 +369,7 @@
 
     // AC only loop
     index = 2;
+    thr = _mm_srai_epi16(dequant, 1);
     while (n_coeffs < 0) {
       __m128i coeff0, coeff1;
       {
@@ -387,28 +389,39 @@
         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
 
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+        if (nzflag) {
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
 
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
       }
 
-      {
+      if (nzflag) {
         // Scan for eob
         __m128i zero_coeff0, zero_coeff1;
         __m128i nzero_coeff0, nzero_coeff1;
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -230,6 +230,8 @@
                           const int16_t* scan_ptr,
                           const int16_t* iscan_ptr) {
   __m128i zero;
+  __m128i thr;
+  int16_t nzflag;
   (void)scan_ptr;
   (void)zbin_ptr;
   (void)quant_shift_ptr;
@@ -316,6 +318,8 @@
       n_coeffs += 8 * 2;
     }
 
+    thr = _mm_srai_epi16(dequant, 1);
+
     // AC only loop
     while (n_coeffs < 0) {
       __m128i coeff0, coeff1;
@@ -335,28 +339,39 @@
         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+            _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
 
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+        if (nzflag) {
+          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
-        _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+          // Reinsert signs
+          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
 
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
 
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
-        _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+        } else {
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+          _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+        }
       }
 
-      {
+      if (nzflag) {
         // Scan for eob
         __m128i zero_coeff0, zero_coeff1;
         __m128i nzero_coeff0, nzero_coeff1;