ref: 5875d7a4a4b4a17dd891f1da0a30a220e4d9f798
parent: 9c6fafb25bd4eb22dec20cfd14ed56837662961b
parent: a7c4de22e106f005a21efd0e73f3e5ff31d8152e
author: Jingning Han <[email protected]>
date: Mon Jul 29 11:29:25 EDT 2013
Merge "16x16 inverse 2D-DCT with DC only"
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -864,10 +864,18 @@
}
}
-void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
+void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {  /* DC-only path: only input[0] is read. */
+ int i, j;
+ int a1;  /* single value added to every pixel of the block */
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
- output[0] = ROUND_POWER_OF_TWO(out, 6);
+ a1 = ROUND_POWER_OF_TWO(out, 6);  /* computed once, reused 256 times */
+ for (j = 0; j < 16; ++j) {  /* 16 rows */
+ for (i = 0; i < 16; ++i)  /* 16 pixels per row */
+ dest[i] = clip_pixel(dest[i] + a1);  /* updates dest in place */
+ dest += dest_stride;  /* step to the next row */
+ }
}
static void idct32_1d(int16_t *input, int16_t *output) {
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -306,14 +306,14 @@
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_8x8_add sse2
+prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_1_add sse2
+
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16_add sse2
prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_16x16_add sse2
-
-prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_16x16
prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct32x32_add sse2
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -1470,6 +1470,38 @@
}
}
+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;  /* holds a replicated into every 16-bit lane */
+ const __m128i zero = _mm_setzero_si128();  /* NOTE(review): not referenced below; presumably used inside RECON_AND_STORE - confirm */
+ int a, i;  /* NOTE(review): a is int; the _c version keeps this value in int16_t */
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);  /* only input[0] is read */
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);  /* same three steps as vp9_short_idct16x16_1_add_c */
+
+ dc_value = _mm_set1_epi16(a);  /* broadcast a to all eight 16-bit lanes */
+
+ for (i = 0; i < 2; ++i) {  /* two passes of 16 RECON_AND_STORE calls each */
+ RECON_AND_STORE(dest, dc_value);  /* presumably handles 8 pixels of one row and advances dest by stride - confirm in macro */
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ dest += 8 - (stride * 16);  /* back up 16 * stride, move 8 bytes right; assumes the 16 calls advanced dest by 16 * stride - confirm */
+ }
+}
+
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
__m128i tbuf[8];
array_transpose_8x8(res0, res0);
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -123,14 +123,8 @@
if (eob) {
if (eob == 1) {
/* DC only DCT coefficient. */
- int16_t in = input[0];
- int16_t out;
- /* Note: the idct1 will need to be modified accordingly whenever
- * vp9_short_idct16x16() is modified. */
- vp9_short_idct1_16x16_c(&in, &out);
+ vp9_short_idct16x16_1_add(input, dest, stride);
input[0] = 0;
-
- vp9_add_constant_residual_16x16(out, dest, stride);
} else if (eob <= 10) {
vp9_short_idct10_16x16_add(input, dest, stride);
vpx_memset(input, 0, 512);
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -61,7 +61,9 @@
static void inverse_transform_b_16x16_add(MACROBLOCKD *xd, int eob,
int16_t *dqcoeff, uint8_t *dest,
int stride) {
- if (eob <= 10)
+ if (eob <= 1)
+ vp9_short_idct16x16_1_add(dqcoeff, dest, stride);
+ else if (eob <= 10)
vp9_short_idct10_16x16_add(dqcoeff, dest, stride);
else
vp9_short_idct16x16_add(dqcoeff, dest, stride);