ref: 9df24b41ca16353acb123acae7c70813cfffafdd
parent: b7cd01ed7375b1e5b6dc67f7427d07298f244471
parent: c8defcfdeea614a780af9a2405f59c60cab876ad
author: Ronald S. Bultje <[email protected]>
date: Tue Jul 2 05:38:08 EDT 2013
Merge "Update quantize SSSE3 SIMD to cover 32x32 transform case also."
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -569,6 +569,9 @@
prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
+prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
+
#
# Structured Similarity (SSIM)
#
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -85,7 +85,7 @@
}
// This function works well for large transform size.
-static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
int16_t *quant_ptr, int16_t *quant_shift_ptr,
@@ -92,11 +92,12 @@
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
int16_t *dequant_ptr, int zbin_oq_value,
uint16_t *eob_ptr, const int16_t *scan,
- int *idx_arr) {
+ const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
int idx = 0;
+ int idx_arr[1024];
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -179,20 +180,18 @@
// Call different quantization for different transform size.
if (n_coeffs >= 1024) {
// Save index of picked coefficient in pre-scan pass.
- int idx_arr[1024];
-
- quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, idx_arr);
+ vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ n_coeffs, mb->skip_block,
+ mb->plane[plane].zbin,
+ mb->plane[plane].round,
+ mb->plane[plane].quant,
+ mb->plane[plane].quant_shift,
+ BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+ BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+ xd->plane[plane].dequant,
+ mb->plane[plane].zbin_extra,
+ &xd->plane[plane].eobs[block],
+ scan, iscan);
}
else {
vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -15,10 +15,10 @@
SECTION .text
-INIT_XMM ssse3
-cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
- eob, scan, iscan
+%macro QUANTIZE_FN 1
+cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -57,6 +57,10 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
+%ifidn %1, b_32x32
+ paddw m6, m6
+ paddw m11, m11
+%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
@@ -77,9 +81,19 @@
pand m13, m12
mova [qcoeffq+ncoeffq*2+ 0], m8
mova [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
pmullw m8, m3 ; dqc[i] = qc[i] * q
punpckhqdq m3, m3
pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
mova [dqcoeffq+ncoeffq*2+ 0], m8
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
@@ -99,6 +113,10 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
+%ifidn %1, b_32x32
+ paddw m6, m6
+ paddw m11, m11
+%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
paddw m6, m1 ; m6 += round
@@ -115,8 +133,18 @@
pand m13, m12
mova [qcoeffq+ncoeffq*2+ 0], m14
mova [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
pmullw m14, m3 ; dqc[i] = qc[i] * q
pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
mova [dqcoeffq+ncoeffq*2+ 0], m14
mova [dqcoeffq+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
@@ -163,3 +191,8 @@
jl .blank_loop
mov word [eobq], 0
RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b
+QUANTIZE_FN b_32x32