shithub: libvpx

--- a/test/dct32x32_test.cc

+++ b/test/dct32x32_test.cc

@@ -18,7 +18,7 @@

 #include "vp9/common/vp9_entropy.h"

 #include "./vp9_rtcd.h"

   void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);

-  void vp9_short_idct32x32_c(short *input, short *output, int pitch);

+  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);

 #include "test/acm_random.h"

@@ -91,28 +91,31 @@

 TEST(VP9Idct32x32Test, AccuracyCheck) {

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   const int count_test_block = 1000;

   for (int i = 0; i < count_test_block; ++i) {

     int16_t in[1024], coeff[1024];

-    int16_t out_c[1024];

+    uint8_t dst[1024], src[1024];

     double out_r[1024];

+    for (int j = 0; j < 1024; ++j) {

+      src[j] = rnd.Rand8();

+      dst[j] = rnd.Rand8();

+    }

     // Initialize a test block with input range [-255, 255].

     for (int j = 0; j < 1024; ++j)

-      in[j] = rnd.Rand8() - rnd.Rand8();

+      in[j] = src[j] - dst[j];

     reference_32x32_dct_2d(in, out_r);

     for (int j = 0; j < 1024; j++)

       coeff[j] = round(out_r[j]);

-    vp9_short_idct32x32_c(coeff, out_c, 64);

+    vp9_short_idct32x32_add_c(coeff, dst, 32);

     for (int j = 0; j < 1024; ++j) {

-      const int diff = out_c[j] - in[j];

+      const int diff = dst[j] - src[j];

       const int error = diff * diff;

       EXPECT_GE(1, error)

-          << "Error: 3x32 IDCT has error " << error

+          << "Error: 32x32 IDCT has error " << error

           << " at index " << j;

@@ -126,18 +129,22 @@

   for (int i = 0; i < count_test_block; ++i) {

     int16_t test_input_block[1024];

     int16_t test_temp_block[1024];

-    int16_t test_output_block[1024];

+    uint8_t dst[1024], src[1024];

+    for (int j = 0; j < 1024; ++j) {

+      src[j] = rnd.Rand8();

+      dst[j] = rnd.Rand8();

+    }

     // Initialize a test block with input range [-255, 255].

     for (int j = 0; j < 1024; ++j)

-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();

+      test_input_block[j] = src[j] - dst[j];

     const int pitch = 64;

     vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);

-    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);

+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);

     for (int j = 0; j < 1024; ++j) {

-      const unsigned diff = test_input_block[j] - test_output_block[j];

+      const unsigned diff = dst[j] - src[j];

       const unsigned error = diff * diff;

       if (max_error < error)

         max_error = error;

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -1249,10 +1249,9 @@

   output[31] = step1[0] - step1[31];

-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int16_t out[32 * 32];

   int16_t *outptr = out;

-  const int half_pitch = pitch >> 1;

   int i, j;

   int16_t temp_in[32], temp_out[32];

@@ -1269,7 +1268,8 @@

       temp_in[j] = out[j * 32 + i];

     idct32_1d(temp_in, temp_out);

     for (j = 0; j < 32; ++j)

-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

+                                  + dest[j * dest_stride + i]);

@@ -1279,10 +1279,10 @@

   output[0] = ROUND_POWER_OF_TWO(out, 6);

-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,

+                                  int dest_stride) {

   int16_t out[32 * 32];

   int16_t *outptr = out;

-  const int half_pitch = pitch >> 1;

   int i, j;

   int16_t temp_in[32], temp_out[32];

@@ -1302,6 +1302,7 @@

       temp_in[j] = out[j * 32 + i];

     idct32_1d(temp_in, temp_out);

     for (j = 0; j < 32; ++j)

-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)

+                                  + dest[j * dest_stride + i]);

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -94,9 +94,6 @@

 prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"

 specialize vp9_add_residual_16x16 sse2

-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"

-specialize vp9_add_residual_32x32 sse2

 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"

 specialize vp9_add_constant_residual_8x8 sse2

@@ -212,15 +209,14 @@

 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"

 specialize vp9_short_idct1_16x16

+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct32x32_add sse2

-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct32x32 sse2

 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"

 specialize vp9_short_idct1_32x32

-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct10_32x32

+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct10_32x32_add

 prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"

 specialize vp9_short_iht8x8

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -1319,8 +1319,7 @@

-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {

-  const int half_pitch = pitch >> 1;

+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

   const __m128i final_rounding = _mm_set1_epi16(1<<5);

@@ -1832,6 +1831,8 @@

       col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);

       col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);

     } else {

+      const __m128i zero = _mm_setzero_si128();

       // 2_D: Calculate the results and store them to destination.

       in0 = _mm_add_epi16(stp1_0, stp1_31);

       in1 = _mm_add_epi16(stp1_1, stp1_30);

@@ -1933,41 +1934,50 @@

       in30 = _mm_srai_epi16(in30, 6);

       in31 = _mm_srai_epi16(in31, 6);

-      // Store results

-      _mm_store_si128((__m128i *)output, in0);

-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);

-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);

-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);

-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);

-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);

-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);

-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);

-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);

-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);

-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);

-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);

-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);

-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);

-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);

-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);

-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);

-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);

-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);

-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);

-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);

-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);

-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);

-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);

-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);

+#define RECON_AND_STORE(dest, in_x) \

+  {                                                     \

+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \

+      d0 = _mm_unpacklo_epi8(d0, zero); \

+      in_x = _mm_add_epi16(in_x, d0); \

+      in_x = _mm_packus_epi16(in_x, in_x); \

+      _mm_storel_epi64((__m128i *)(dest), in_x); \

+      dest += stride; \

+  }

-      output += 8;

+      RECON_AND_STORE(dest, in0);

+      RECON_AND_STORE(dest, in1);

+      RECON_AND_STORE(dest, in2);

+      RECON_AND_STORE(dest, in3);

+      RECON_AND_STORE(dest, in4);

+      RECON_AND_STORE(dest, in5);

+      RECON_AND_STORE(dest, in6);

+      RECON_AND_STORE(dest, in7);

+      RECON_AND_STORE(dest, in8);

+      RECON_AND_STORE(dest, in9);

+      RECON_AND_STORE(dest, in10);

+      RECON_AND_STORE(dest, in11);

+      RECON_AND_STORE(dest, in12);

+      RECON_AND_STORE(dest, in13);

+      RECON_AND_STORE(dest, in14);

+      RECON_AND_STORE(dest, in15);

+      RECON_AND_STORE(dest, in16);

+      RECON_AND_STORE(dest, in17);

+      RECON_AND_STORE(dest, in18);

+      RECON_AND_STORE(dest, in19);

+      RECON_AND_STORE(dest, in20);

+      RECON_AND_STORE(dest, in21);

+      RECON_AND_STORE(dest, in22);

+      RECON_AND_STORE(dest, in23);

+      RECON_AND_STORE(dest, in24);

+      RECON_AND_STORE(dest, in25);

+      RECON_AND_STORE(dest, in26);

+      RECON_AND_STORE(dest, in27);

+      RECON_AND_STORE(dest, in28);

+      RECON_AND_STORE(dest, in29);

+      RECON_AND_STORE(dest, in30);

+      RECON_AND_STORE(dest, in31);

+      dest += 8 - (stride * 32);

--- a/vp9/decoder/vp9_idct_blk.c

+++ b/vp9/decoder/vp9_idct_blk.c

@@ -109,10 +109,6 @@

   add_residual(diff, dest, stride, 16, 16);

-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {

-  add_residual(diff, dest, stride, 32, 32);

-}

 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,

                                   int width, int height) {

   int r, c;

@@ -321,20 +317,16 @@

       input[0] = 0;

 #if !CONFIG_SCATTERSCAN

     } else if (eob <= 10) {

-      // the idct halves ( >> 1) the pitch

-      vp9_short_idct10_32x32(input, output, 64);

+      vp9_short_idct10_32x32_add_c(input, dest, stride);

       input[0] = input[1] = input[2] = input[3] = 0;

       input[32] = input[33] = input[34] = 0;

       input[64] = input[65] = 0;

       input[96] = 0;

-      vp9_add_residual_32x32(output, dest, stride);

 #endif

     } else {

-      vp9_short_idct32x32(input, output, 64);

+      vp9_short_idct32x32_add(input, dest, stride);

       vpx_memset(input, 0, 2048);

-      vp9_add_residual_32x32(output, dest, stride);

--- a/vp9/decoder/x86/vp9_dequantize_sse2.c

+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c

@@ -181,65 +181,6 @@

   } while (--i);

-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,

-                                 int stride) {

-  const int width = 32;

-  int i = 16;

-  const __m128i zero = _mm_setzero_si128();

-  // Diff data

-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;

-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;

-  do {

-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));

-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));

-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));

-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));

-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));

-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));

-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));

-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));

-    // Prediction data.

-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));

-    p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));

-    p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));

-    p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));

-    p0 = _mm_unpacklo_epi8(p1, zero);

-    p1 = _mm_unpackhi_epi8(p1, zero);

-    p2 = _mm_unpacklo_epi8(p3, zero);

-    p3 = _mm_unpackhi_epi8(p3, zero);

-    p4 = _mm_unpacklo_epi8(p5, zero);

-    p5 = _mm_unpackhi_epi8(p5, zero);

-    p6 = _mm_unpacklo_epi8(p7, zero);

-    p7 = _mm_unpackhi_epi8(p7, zero);

-    p0 = _mm_add_epi16(p0, d0);

-    p1 = _mm_add_epi16(p1, d1);

-    p2 = _mm_add_epi16(p2, d2);

-    p3 = _mm_add_epi16(p3, d3);

-    p4 = _mm_add_epi16(p4, d4);

-    p5 = _mm_add_epi16(p5, d5);

-    p6 = _mm_add_epi16(p6, d6);

-    p7 = _mm_add_epi16(p7, d7);

-    p0 = _mm_packus_epi16(p0, p1);

-    p1 = _mm_packus_epi16(p2, p3);

-    p2 = _mm_packus_epi16(p4, p5);

-    p3 = _mm_packus_epi16(p6, p7);

-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);

-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);

-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);

-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);

-    diff += 2 * width;

-    dest += 2 * stride;

-  } while (--i);

-}

 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,

                                         int stride) {

   uint8_t abs_diff;

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -425,6 +425,7 @@

   VP9_COMMON *cm;

   MACROBLOCK *x;

   struct optimize_ctx *ctx;

+  int *wip_txfrm_size;  // for "work in progress" only... will remove once done

};

 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,

@@ -493,6 +494,7 @@

                          int ss_txfrm_size, void *arg) {

   struct encode_b_args* const args = arg;

   MACROBLOCK* const x = args->x;

+  int *wip_txfrm_size = args->wip_txfrm_size;

   MACROBLOCKD* const xd = &x->e_mbd;

   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);

   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,

@@ -500,6 +502,10 @@

   int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,

                                                   raster_block,

                                                   xd->plane[plane].diff);

+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,

+                                                 raster_block,

+                                                 xd->plane[plane].dst.buf,

+                                                 xd->plane[plane].dst.stride);

   TX_TYPE tx_type = DCT_DCT;

   xform_quant(plane, block, bsize, ss_txfrm_size, arg);

@@ -509,8 +515,9 @@

   switch (ss_txfrm_size / 2) {

     case TX_32X32:

-      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

-                          diff, bw * 2);

+        vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,

+                                block, 16), dst, xd->plane[plane].dst.stride);

+        *wip_txfrm_size = 32;

       break;

     case TX_16X16:

       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;

@@ -521,6 +528,7 @@

         vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

                            diff, bw, tx_type);

+      *wip_txfrm_size = 16;

       break;

     case TX_8X8:

       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;

@@ -531,6 +539,7 @@

         vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

                          diff, bw, tx_type);

+      *wip_txfrm_size = 8;

       break;

     case TX_4X4:

       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;

@@ -544,6 +553,7 @@

         vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

                          diff, bw, tx_type);

+      *wip_txfrm_size = 4;

       break;

@@ -551,7 +561,7 @@

 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,

                          BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

-  struct encode_b_args arg = {cm, x, NULL};

+  struct encode_b_args arg = {cm, x, NULL, NULL};

   foreach_transformed_block_in_plane(xd, bsize, 0,

                                      xform_quant, &arg);

@@ -560,7 +570,7 @@

 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,

                          BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

-  struct encode_b_args arg = {cm, x, NULL};

+  struct encode_b_args arg = {cm, x, NULL, NULL};

   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);

@@ -569,7 +579,8 @@

                     BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

   struct optimize_ctx ctx;

-  struct encode_b_args arg = {cm, x, &ctx};

+  int wip_txfrm_size = 0;

+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

   vp9_subtract_sby(x, bsize);

   if (x->optimize)

@@ -577,8 +588,8 @@

   foreach_transformed_block_in_plane(xd, bsize, 0,

                                      encode_block, &arg);

-  vp9_recon_sby(xd, bsize);

+  if (wip_txfrm_size < 32)

+    vp9_recon_sby(xd, bsize);

 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,

@@ -585,7 +596,8 @@

                      BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

   struct optimize_ctx ctx;

-  struct encode_b_args arg = {cm, x, &ctx};

+  int wip_txfrm_size = 0;

+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

   vp9_subtract_sbuv(x, bsize);

   if (x->optimize)

@@ -593,7 +605,8 @@

   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

-  vp9_recon_sbuv(xd, bsize);

+  if (wip_txfrm_size < 32)

+    vp9_recon_sbuv(xd, bsize);

 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,

@@ -600,13 +613,27 @@

                    BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

   struct optimize_ctx ctx;

-  struct encode_b_args arg = {cm, x, &ctx};

+  int wip_txfrm_size = 0;

+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

   vp9_subtract_sb(x, bsize);

   if (x->optimize)

     vp9_optimize_init(xd, bsize, &ctx);

+#if 0

   foreach_transformed_block(xd, bsize, encode_block, &arg);

   vp9_recon_sb(xd, bsize);

+#else

+  // wip version... will use foreach_transformed_block when done

+  foreach_transformed_block_in_plane(xd, bsize, 0,

+                                     encode_block, &arg);

+  if (wip_txfrm_size < 32)

+    vp9_recon_sby(xd, bsize);

+  wip_txfrm_size = 0;

+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

+  if (wip_txfrm_size < 32)

+    vp9_recon_sbuv(xd, bsize);

+#endif