ref: 325e0aa6505eb480f5a55e072e195cbc3db0aacf
parent: 52256cdbcaf07e637c964f92671dfc82321f2125
author: Jingning Han <[email protected]>
date: Fri Jul 26 10:11:37 EDT 2013
Special handle on DC only inverse 8x8 2D-DCT This commit enables a special handle for the 8x8 inverse 2D-DCT, where only DC coefficient is quantized to be non-zero. For bus_cif at 2000 kbps, it provides about 1% speed-up at speed 0. Change-Id: I2523222359eec26b144cf8fd4c63a4ad63b1b011
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -225,6 +225,19 @@
}
}
+void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+ int i, j;
+ int a1;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = clip_pixel(dest[i] + a1);
+ dest += dest_stride;
+ }
+}
+
static void iadst4_1d(int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -431,12 +444,6 @@
dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ dest[j * dest_stride + i]);
}
-}
-
-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- output[0] = ROUND_POWER_OF_TWO(out, 5);
}
static void idct16_1d(int16_t *input, int16_t *output) {
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -297,14 +297,14 @@
prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct4x4_add sse2
+prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_1_add sse2
+
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct8x8_add sse2 neon
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_8x8_add sse2
-
-prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_8x8
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16_add sse2
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -523,9 +523,9 @@
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
- in_x = _mm_add_epi16(in_x, d0); \
- in_x = _mm_packus_epi16(in_x, in_x); \
- _mm_storel_epi64((__m128i *)(dest), in_x); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
dest += stride; \
}
@@ -595,6 +595,27 @@
RECON_AND_STORE(dest, in5);
RECON_AND_STORE(dest, in6);
RECON_AND_STORE(dest, in7);
+}
+
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 5);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
}
// perform 8x8 transpose
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -93,15 +93,8 @@
if (eob) {
if (eob == 1) {
// DC only DCT coefficient
- int16_t in = input[0];
- int16_t out;
-
- // Note: the idct1 will need to be modified accordingly whenever
- // vp9_short_idct8x8_c() is modified.
- vp9_short_idct1_8x8_c(&in, &out);
+ vp9_short_idct8x8_1_add(input, dest, stride);
input[0] = 0;
-
- vp9_add_constant_residual_8x8(out, dest, stride);
} else {
vp9_short_idct8x8_add(input, dest, stride);
vpx_memset(input, 0, 128);
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -47,6 +47,14 @@
xd->inv_txm4x4_add(dqcoeff, dest, stride);
}
+static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob,
+ int16_t *dqcoeff, uint8_t *dest,
+ int stride) {
+ if (eob <= 1)
+ vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
+ else
+ vp9_short_idct8x8_add(dqcoeff, dest, stride);
+}
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
@@ -533,7 +541,8 @@
vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_8X8:
- vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+ inverse_transform_b_8x8_add(xd, pd->eobs[block], dqcoeff,
+ dst, pd->dst.stride);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -711,7 +720,7 @@
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob) {
if (tx_type == DCT_DCT)
- vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+ inverse_transform_b_8x8_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
else
vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
}
@@ -746,8 +755,7 @@
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, *eob, dqcoeff,
- dst, pd->dst.stride);
+ inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
else
vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
}