shithub: libvpx

--- a/test/dct16x16_test.cc

+++ b/test/dct16x16_test.cc

@@ -17,6 +17,7 @@

 extern "C" {

 #include "vp9/common/vp9_entropy.h"

 #include "vp9_rtcd.h"

+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride);

 #include "acm_random.h"

@@ -269,19 +270,23 @@

   const int count_test_block = 1000;

   for (int i = 0; i < count_test_block; ++i) {

     int16_t in[256], coeff[256];

-    int16_t out_c[256];

+    uint8_t dst[256], src[256];

     double out_r[256];

+    for (int j = 0; j < 256; ++j) {

+      src[j] = rnd.Rand8();

+      dst[j] = rnd.Rand8();

+    }

     // Initialize a test block with input range [-255, 255].

     for (int j = 0; j < 256; ++j)

-      in[j] = rnd.Rand8() - rnd.Rand8();

+      in[j] = src[j] - dst[j];

     reference_16x16_dct_2d(in, out_r);

     for (int j = 0; j < 256; j++)

       coeff[j] = round(out_r[j]);

-    vp9_short_idct16x16_c(coeff, out_c, 32);

+    vp9_short_idct16x16_add_c(coeff, dst, 16);

     for (int j = 0; j < 256; ++j) {

-      const int diff = out_c[j] - in[j];

+      const int diff = dst[j] - src[j];

       const int error = diff * diff;

       EXPECT_GE(1, error)

           << "Error: 16x16 IDCT has error " << error

@@ -289,7 +294,7 @@

-#if 1

 // we need enable fdct test once we re-do the 16 point fdct.

 TEST(VP9Fdct16x16Test, AccuracyCheck) {

   ACMRandom rnd(ACMRandom::DeterministicSeed());

@@ -299,18 +304,22 @@

   for (int i = 0; i < count_test_block; ++i) {

     int16_t test_input_block[256];

     int16_t test_temp_block[256];

-    int16_t test_output_block[256];

+    uint8_t dst[256], src[256];

+    for (int j = 0; j < 256; ++j) {

+      src[j] = rnd.Rand8();

+      dst[j] = rnd.Rand8();

+    }

     // Initialize a test block with input range [-255, 255].

     for (int j = 0; j < 256; ++j)

-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();

+      test_input_block[j] = src[j] - dst[j];

     const int pitch = 32;

     vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);

-    vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);

+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);

     for (int j = 0; j < 256; ++j) {

-      const int diff = test_input_block[j] - test_output_block[j];

+      const int diff = dst[j] - src[j];

       const int error = diff * diff;

       if (max_error < error)

         max_error = error;

@@ -354,6 +363,4 @@

-#endif

 }  // namespace

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -621,10 +621,9 @@

   output[15] = step2[0] - step2[15];

-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int16_t out[16 * 16];

   int16_t *outptr = out;

-  const int half_pitch = pitch >> 1;

   int i, j;

   int16_t temp_in[16], temp_out[16];

@@ -641,7 +640,8 @@

       temp_in[j] = out[j * 16 + i];

     idct16_1d(temp_in, temp_out);

     for (j = 0; j < 16; ++j)

-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

+                                  + dest[j * dest_stride + i]);

@@ -823,8 +823,8 @@

   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3

};

-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,

-                          int pitch, int tx_type) {

+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,

+                              int tx_type) {

   int i, j;

   int16_t out[16 * 16];

   int16_t *outptr = out;

@@ -844,37 +844,37 @@

       temp_in[j] = out[j * 16 + i];

     ht.cols(temp_in, temp_out);

     for (j = 0; j < 16; ++j)

-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

-  }

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

+                                  + dest[j * dest_stride + i]);  }

-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {

-    int16_t out[16 * 16];

-    int16_t *outptr = out;

-    const int half_pitch = pitch >> 1;

-    int i, j;

-    int16_t temp_in[16], temp_out[16];

+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,

+                                  int dest_stride) {

+  int16_t out[16 * 16];

+  int16_t *outptr = out;

+  int i, j;

+  int16_t temp_in[16], temp_out[16];

-    /* First transform rows. Since all non-zero dct coefficients are in

-     * upper-left 4x4 area, we only need to calculate first 4 rows here.

-     */

-    vpx_memset(out, 0, sizeof(out));

-    for (i = 0; i < 4; ++i) {

-      idct16_1d(input, outptr);

-      input += 16;

-      outptr += 16;

-    }

+  /* First transform rows. Since all non-zero dct coefficients are in

+   * upper-left 4x4 area, we only need to calculate first 4 rows here.

+   */

+  vpx_memset(out, 0, sizeof(out));

+  for (i = 0; i < 4; ++i) {

+    idct16_1d(input, outptr);

+    input += 16;

+    outptr += 16;

+  }

-    // Then transform columns

-    for (i = 0; i < 16; ++i) {

-      for (j = 0; j < 16; ++j)

-        temp_in[j] = out[j*16 + i];

-      idct16_1d(temp_in, temp_out);

-      for (j = 0; j < 16; ++j)

-        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

-    }

+  // Then transform columns

+  for (i = 0; i < 16; ++i) {

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = out[j*16 + i];

+    idct16_1d(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

+                                  + dest[j * dest_stride + i]);

+  }

 void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {

   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -91,9 +91,6 @@

 prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"

 specialize vp9_add_residual_8x8 sse2

-prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"

-specialize vp9_add_residual_16x16 sse2

 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"

 specialize vp9_add_constant_residual_8x8 sse2

@@ -200,11 +197,11 @@

 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"

 specialize vp9_short_idct1_8x8

-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct16x16 sse2

+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct16x16_add sse2

-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct10_16x16 sse2

+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct10_16x16_add sse2

 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"

 specialize vp9_short_idct1_16x16

@@ -224,8 +221,8 @@

 prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"

 specialize vp9_short_iht4x4

-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"

-specialize vp9_short_iht16x16

+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"

+specialize vp9_short_iht16x16_add

 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"

 specialize vp9_idct4_1d sse2

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -752,8 +752,17 @@

                            stp2_10, stp2_13, stp2_11, stp2_12) \

-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {

-  const int half_pitch = pitch >> 1;

+#define RECON_AND_STORE(dest, in_x) /* add in_x to pixels at dest, store */ \

+  { /* Uses `zero` and `stride` from the scope where it is expanded. */ \

+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); /* load low 64 bits */ \

+      d0 = _mm_unpacklo_epi8(d0, zero); /* interleave bytes with zero */ \

+      in_x = _mm_add_epi16(in_x, d0); /* 16-bit add; in_x is overwritten */ \

+      in_x = _mm_packus_epi16(in_x, in_x); /* pack, unsigned saturation */ \

+      _mm_storel_epi64((__m128i *)(dest), in_x); /* write low 64 bits back */ \

+      dest += stride; /* side effect: moves the caller's dest pointer */ \

+  }

+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

   const __m128i final_rounding = _mm_set1_epi16(1<<5);

   const __m128i zero = _mm_setzero_si128();

@@ -938,31 +947,30 @@

       in14 = _mm_srai_epi16(in14, 6);

       in15 = _mm_srai_epi16(in15, 6);

-      // Store results

-      _mm_store_si128((__m128i *)output, in0);

-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);

-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);

-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);

-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);

-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);

-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);

-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);

-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);

+      RECON_AND_STORE(dest, in0);

+      RECON_AND_STORE(dest, in1);

+      RECON_AND_STORE(dest, in2);

+      RECON_AND_STORE(dest, in3);

+      RECON_AND_STORE(dest, in4);

+      RECON_AND_STORE(dest, in5);

+      RECON_AND_STORE(dest, in6);

+      RECON_AND_STORE(dest, in7);

+      RECON_AND_STORE(dest, in8);

+      RECON_AND_STORE(dest, in9);

+      RECON_AND_STORE(dest, in10);

+      RECON_AND_STORE(dest, in11);

+      RECON_AND_STORE(dest, in12);

+      RECON_AND_STORE(dest, in13);

+      RECON_AND_STORE(dest, in14);

+      RECON_AND_STORE(dest, in15);

-      output += 8;

+      dest += 8 - (stride * 16);

-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {

-  const int half_pitch = pitch >> 1;

+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,

+                                     int stride) {

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

   const __m128i final_rounding = _mm_set1_epi16(1<<5);

   const __m128i zero = _mm_setzero_si128();

@@ -1007,7 +1015,6 @@

           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;

   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

   int i;

   // 1-D idct. Load input data.

   in0 = _mm_load_si128((__m128i *)input);

   in8 = _mm_load_si128((__m128i *)(input + 8 * 1));

@@ -1298,24 +1305,24 @@

     in14 = _mm_srai_epi16(in14, 6);

     in15 = _mm_srai_epi16(in15, 6);

-    // Store results

-    _mm_store_si128((__m128i *)output, in0);

-    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

-    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

-    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

-    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

-    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

-    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

-    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

-    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);

-    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);

-    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);

-    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);

-    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);

-    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);

-    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);

-    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);

-    output += 8;

+    RECON_AND_STORE(dest, in0);

+    RECON_AND_STORE(dest, in1);

+    RECON_AND_STORE(dest, in2);

+    RECON_AND_STORE(dest, in3);

+    RECON_AND_STORE(dest, in4);

+    RECON_AND_STORE(dest, in5);

+    RECON_AND_STORE(dest, in6);

+    RECON_AND_STORE(dest, in7);

+    RECON_AND_STORE(dest, in8);

+    RECON_AND_STORE(dest, in9);

+    RECON_AND_STORE(dest, in10);

+    RECON_AND_STORE(dest, in11);

+    RECON_AND_STORE(dest, in12);

+    RECON_AND_STORE(dest, in13);

+    RECON_AND_STORE(dest, in14);

+    RECON_AND_STORE(dest, in15);

+    dest += 8 - (stride * 16);

@@ -1933,16 +1940,6 @@

       in29 = _mm_srai_epi16(in29, 6);

       in30 = _mm_srai_epi16(in30, 6);

       in31 = _mm_srai_epi16(in31, 6);

-#define RECON_AND_STORE(dest, in_x) \

-  {                                                     \

-     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \

-      d0 = _mm_unpacklo_epi8(d0, zero); \

-      in_x = _mm_add_epi16(in_x, d0); \

-      in_x = _mm_packus_epi16(in_x, in_x); \

-      _mm_storel_epi64((__m128i *)(dest), in_x); \

-      dest += stride; \

-  }

       RECON_AND_STORE(dest, in0);

       RECON_AND_STORE(dest, in1);

--- a/vp9/decoder/vp9_idct_blk.c

+++ b/vp9/decoder/vp9_idct_blk.c

@@ -105,10 +105,6 @@

   add_residual(diff, dest, stride, 8, 8);

-void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {

-  add_residual(diff, dest, stride, 16, 16);

-}

 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,

                                   int width, int height) {

   int r, c;

@@ -260,19 +256,14 @@

   if (tx_type == DCT_DCT) {

     vp9_idct_add_16x16(input, dest, stride, eob);

   } else {

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);

     if (eob > 0) {

-      vp9_short_iht16x16(input, output, 16, tx_type);

+      vp9_short_iht16x16_add(input, dest, stride, tx_type);

       vpx_memset(input, 0, 512);

-      vp9_add_residual_16x16(output, dest, stride);

 void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);

   /* The calculation can be simplified if there are not many non-zero dct

    * coefficients. Use eobs to separate different cases. */

   if (eob) {

@@ -288,21 +279,15 @@

       vp9_add_constant_residual_16x16(out, dest, stride);

 #if !CONFIG_SCATTERSCAN

     } else if (eob <= 10) {

-      // the idct halves ( >> 1) the pitch

-      vp9_short_idct10_16x16(input, output, 32);

+      vp9_short_idct10_16x16_add(input, dest, stride);

       input[0] = input[1] = input[2] = input[3] = 0;

       input[16] = input[17] = input[18] = 0;

       input[32] = input[33] = 0;

       input[48] = 0;

-      vp9_add_residual_16x16(output, dest, stride);

 #endif

     } else {

-      // the idct halves ( >> 1) the pitch

-      vp9_short_idct16x16(input, output, 16 << 1);

+      vp9_short_idct16x16_add(input, dest, stride);

       vpx_memset(input, 0, 512);

-      vp9_add_residual_16x16(output, dest, stride);

--- a/vp9/decoder/x86/vp9_dequantize_sse2.c

+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c

@@ -122,65 +122,6 @@

   _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);

-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,

-                                 int stride) {

-  const int width = 16;

-  int i = 4;

-  const __m128i zero = _mm_setzero_si128();

-  // Diff data

-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;

-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;

-  do {

-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));

-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));

-    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));

-    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));

-    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));

-    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));

-    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));

-    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));

-    // Prediction data.

-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));

-    p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));

-    p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));

-    p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));

-    p0 = _mm_unpacklo_epi8(p1, zero);

-    p1 = _mm_unpackhi_epi8(p1, zero);

-    p2 = _mm_unpacklo_epi8(p3, zero);

-    p3 = _mm_unpackhi_epi8(p3, zero);

-    p4 = _mm_unpacklo_epi8(p5, zero);

-    p5 = _mm_unpackhi_epi8(p5, zero);

-    p6 = _mm_unpacklo_epi8(p7, zero);

-    p7 = _mm_unpackhi_epi8(p7, zero);

-    p0 = _mm_add_epi16(p0, d0);

-    p1 = _mm_add_epi16(p1, d1);

-    p2 = _mm_add_epi16(p2, d2);

-    p3 = _mm_add_epi16(p3, d3);

-    p4 = _mm_add_epi16(p4, d4);

-    p5 = _mm_add_epi16(p5, d5);

-    p6 = _mm_add_epi16(p6, d6);

-    p7 = _mm_add_epi16(p7, d7);

-    p0 = _mm_packus_epi16(p0, p1);

-    p1 = _mm_packus_epi16(p2, p3);

-    p2 = _mm_packus_epi16(p4, p5);

-    p3 = _mm_packus_epi16(p6, p7);

-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);

-    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);

-    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);

-    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);

-    diff += 4 * width;

-    dest += 4 * stride;

-  } while (--i);

-}

 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,

                                         int stride) {

   uint8_t abs_diff;

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -522,11 +522,12 @@

     case TX_16X16:

       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;

       if (tx_type == DCT_DCT) {

-        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

-                            diff, bw * 2);

+        vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,

+                                block, 16), dst, xd->plane[plane].dst.stride);

       } else {

-        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

-                           diff, bw, tx_type);

+        vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,

+                               block, 16), dst, xd->plane[plane].dst.stride,

+                               tx_type);

       *wip_txfrm_size = 16;

       break;

@@ -605,7 +606,7 @@

   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

-  if (wip_txfrm_size < 32)

+  if (wip_txfrm_size < 16)

     vp9_recon_sbuv(xd, bsize);

@@ -627,13 +628,13 @@

   // wip version... will use foreach_transformed_block when done

   foreach_transformed_block_in_plane(xd, bsize, 0,

                                      encode_block, &arg);

-  if (wip_txfrm_size < 32)

+  if (wip_txfrm_size < 16)

     vp9_recon_sby(xd, bsize);

   wip_txfrm_size = 0;

   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

-  if (wip_txfrm_size < 32)

+  if (wip_txfrm_size < 16)

     vp9_recon_sbuv(xd, bsize);

 #endif