ref: 1ff94fea5639c1c7c4bc99a080aaa985d60d25b7
parent: 7e684e2009d3bc0cba769cb437401a3c7b7e80ed
author: Ronald S. Bultje <[email protected]>
date: Thu Jul 11 09:01:44 EDT 2013
Inline vp9_quantize() in xform_quant(). Cycle times: 4x4: 151 to 131 cycles (15% faster); 8x8: 334 to 306 cycles (9% faster); 16x16: 1401 to 1368 cycles (2.5% faster); 32x32: 7403 to 7367 cycles (0.5% faster). Total encode time of first 50 frames of bus @ 1500kbps (speed 0) goes from 1min39.2 to 1min38.6, i.e. a 0.67% overall speedup. Change-Id: I799a49460e5e3fcab01725564dd49c629bfe935f
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -432,48 +432,86 @@
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
MACROBLOCKD* const xd = &x->e_mbd;
- const int bw = plane_block_width(bsize, &xd->plane[plane]);
- const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
- block, ss_txfrm_size);
- int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16);
- int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
- raster_block,
- x->plane[plane].src_diff);
- TX_TYPE tx_type = DCT_DCT;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16);
+ int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
+ int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+ const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+ TX_TYPE tx_type;
+ const int16_t *scan, *iscan;
+ uint16_t *eob = &pd->eobs[block];
+ const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl;
+ const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
+ int xoff, yoff;
+ int16_t *src_diff;
- switch (ss_txfrm_size / 2) {
+ switch (tx_size) {
case TX_32X32:
+ scan = vp9_default_scan_32x32;
+ iscan = vp9_default_iscan_32x32;
+ block >>= 6;
+ xoff = 32 * (block & twmask);
+ yoff = 32 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->rd_search)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2);
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+ vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
+ scan = get_scan_16x16(tx_type);
+ iscan = get_iscan_16x16(tx_type);
+ block >>= 4;
+ xoff = 16 * (block & twmask);
+ yoff = 16 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (tx_type != DCT_DCT)
- vp9_short_fht16x16(src_diff, coeff, bw, tx_type);
+ vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
else
- x->fwd_txm16x16(src_diff, coeff, bw * 2);
+ x->fwd_txm16x16(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
case TX_8X8:
tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
+ scan = get_scan_8x8(tx_type);
+ iscan = get_iscan_8x8(tx_type);
+ block >>= 2;
+ xoff = 8 * (block & twmask);
+ yoff = 8 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (tx_type != DCT_DCT)
- vp9_short_fht8x8(src_diff, coeff, bw, tx_type);
+ vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
else
- x->fwd_txm8x8(src_diff, coeff, bw * 2);
+ x->fwd_txm8x8(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
case TX_4X4:
- tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+ tx_type = plane == 0 ? get_tx_type_4x4(xd, block) : DCT_DCT;
+ scan = get_scan_4x4(tx_type);
+ iscan = get_iscan_4x4(tx_type);
+ xoff = 4 * (block & twmask);
+ yoff = 4 * (block >> twl);
+ src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (tx_type != DCT_DCT)
- vp9_short_fht4x4(src_diff, coeff, bw, tx_type);
+ vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
else
- x->fwd_txm4x4(src_diff, coeff, bw * 2);
+ x->fwd_txm4x4(src_diff, coeff, bw * 8);
+ vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan, iscan);
break;
default:
assert(0);
}
-
- vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
}
static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -152,63 +152,6 @@
*eob_ptr = eob + 1;
}
-void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
- TX_TYPE tx_type) {
- MACROBLOCKD *const xd = &mb->e_mbd;
- const int16_t *scan, *iscan;
-
- // These contexts may be available in the caller
- switch (n_coeffs) {
- case 4 * 4:
- scan = get_scan_4x4(tx_type);
- iscan = get_iscan_4x4(tx_type);
- break;
- case 8 * 8:
- scan = get_scan_8x8(tx_type);
- iscan = get_iscan_8x8(tx_type);
- break;
- case 16 * 16:
- scan = get_scan_16x16(tx_type);
- iscan = get_iscan_16x16(tx_type);
- break;
- default:
- scan = vp9_default_scan_32x32;
- iscan = vp9_default_iscan_32x32;
- break;
- }
-
- // Call different quantization for different transform size.
- if (n_coeffs >= 1024) {
- // Save index of picked coefficient in pre-scan pass.
- vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, iscan);
- }
- else {
- vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, iscan);
- }
-}
-
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
int y_blocks) {
MACROBLOCKD *const xd = &mb->e_mbd;
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -22,9 +22,6 @@
#define prototype_quantize_mb(sym) \
void (sym)(MACROBLOCK *x)
-void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
- TX_TYPE tx_type);
-
void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
int y_blocks);
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,