shithub: libvpx

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -864,10 +864,18 @@

-void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {

+void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,

+                                 int dest_stride) {

+  int i, j;

+  int a1;

   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

   out = dct_const_round_shift(out * cospi_16_64);

-  output[0] = ROUND_POWER_OF_TWO(out, 6);

+  a1 = ROUND_POWER_OF_TWO(out, 6);

+  for (j = 0; j < 16; ++j) {

+    for (i = 0; i < 16; ++i)

+      dest[i] = clip_pixel(dest[i] + a1);

+    dest += dest_stride;

+  }

 }

 static void idct32_1d(int16_t *input, int16_t *output) {

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -306,14 +306,14 @@

 prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct10_8x8_add sse2

+prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct16x16_1_add sse2

 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct16x16_add sse2

 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct10_16x16_add sse2

-prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"

-specialize vp9_short_idct1_16x16

 prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct32x32_add sse2

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -1470,6 +1470,38 @@

+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {

+  __m128i dc_value;

+  const __m128i zero = _mm_setzero_si128();

+  int a, i;

+  a = dct_const_round_shift(input[0] * cospi_16_64);

+  a = dct_const_round_shift(a * cospi_16_64);

+  a = ROUND_POWER_OF_TWO(a, 6);

+  dc_value = _mm_set1_epi16(a);

+  for (i = 0; i < 2; ++i) {

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    RECON_AND_STORE(dest, dc_value);

+    dest += 8 - (stride * 16);

+  }

+}

 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {

   __m128i tbuf[8];

   array_transpose_8x8(res0, res0);

--- a/vp9/decoder/vp9_idct_blk.c

+++ b/vp9/decoder/vp9_idct_blk.c

@@ -123,14 +123,8 @@

   if (eob) {

     if (eob == 1) {

       /* DC only DCT coefficient. */

-      int16_t in = input[0];

-      int16_t out;

-      /* Note: the idct1 will need to be modified accordingly whenever

-       * vp9_short_idct16x16() is modified. */

-      vp9_short_idct1_16x16_c(&in, &out);

+      vp9_short_idct16x16_1_add(input, dest, stride);

       input[0] = 0;

-      vp9_add_constant_residual_16x16(out, dest, stride);

     } else if (eob <= 10) {

       vp9_short_idct10_16x16_add(input, dest, stride);

       vpx_memset(input, 0, 512);

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -61,7 +61,9 @@

 static void inverse_transform_b_16x16_add(MACROBLOCKD *xd, int eob,

                                           int16_t *dqcoeff, uint8_t *dest,

                                           int stride) {

-  if (eob <= 10)

+  if (eob <= 1)

+    vp9_short_idct16x16_1_add(dqcoeff, dest, stride);

+  else if (eob <= 10)

     vp9_short_idct10_16x16_add(dqcoeff, dest, stride);

   else

     vp9_short_idct16x16_add(dqcoeff, dest, stride);