ref: cf4447339e1a3c615b97ebfe04ba80e54f568853
parent: a4364e5146a0612e0ac604cb6a8d636ae6b325fc
parent: eed1badeddcbc79e6c232194cce513774cb25a30
author: Jingning Han <[email protected]>
date: Wed Apr 1 10:55:18 EDT 2015
Merge "Optimize quantization simd implementation"
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -293,7 +293,8 @@
if (!skip_block) {
__m128i eob;
- __m128i round, quant, dequant;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
{
__m128i coeff0, coeff1;
@@ -368,6 +369,7 @@
// AC only loop
index = 2;
+ thr = _mm_srai_epi16(dequant, 1);
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
{
@@ -387,28 +389,39 @@
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
}
- {
+ if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -230,6 +230,8 @@
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
+ __m128i thr;
+ int16_t nzflag;
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
@@ -316,6 +318,8 @@
n_coeffs += 8 * 2;
}
+ thr = _mm_srai_epi16(dequant, 1);
+
// AC only loop
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
@@ -335,28 +339,39 @@
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
}
- {
+ if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;