ref: 2cf0d4be122f9951b34115401aad069a9464b4c5
parent: 1f26840fbfbef085ed53016bfc91705f148916fd
author: Scott LaVarnway <[email protected]>
date: Tue May 14 07:58:13 EDT 2013
WIP: 32x32 idct/recon merge. This patch eliminates the use of the intermediate diff buffer by combining the short idct and the add-residual step into one function. The encoder can use the same code as well. Change-Id: I4ea09df0e162591e420d869b7431c2e7f89a8c1a
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -18,7 +18,7 @@
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
- void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+ void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
}
#include "test/acm_random.h"
@@ -91,28 +91,31 @@
}
}
-
TEST(VP9Idct32x32Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
int16_t in[1024], coeff[1024];
- int16_t out_c[1024];
+ uint8_t dst[1024], src[1024];
double out_r[1024];
+ for (int j = 0; j < 1024; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
- in[j] = rnd.Rand8() - rnd.Rand8();
+ in[j] = src[j] - dst[j];
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < 1024; j++)
coeff[j] = round(out_r[j]);
- vp9_short_idct32x32_c(coeff, out_c, 64);
+ vp9_short_idct32x32_add_c(coeff, dst, 32);
for (int j = 0; j < 1024; ++j) {
- const int diff = out_c[j] - in[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
- << "Error: 3x32 IDCT has error " << error
+ << "Error: 32x32 IDCT has error " << error
<< " at index " << j;
}
}
@@ -126,18 +129,22 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[1024];
int16_t test_temp_block[1024];
- int16_t test_output_block[1024];
+ uint8_t dst[1024], src[1024];
+ for (int j = 0; j < 1024; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 1024; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 64;
vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
for (int j = 0; j < 1024; ++j) {
- const unsigned diff = test_input_block[j] - test_output_block[j];
+ const unsigned diff = dst[j] - src[j];
const unsigned error = diff * diff;
if (max_error < error)
max_error = error;
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1249,10 +1249,9 @@
output[31] = step1[0] - step1[31];
}
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[32], temp_out[32];
@@ -1269,7 +1268,8 @@
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
@@ -1279,10 +1279,10 @@
output[0] = ROUND_POWER_OF_TWO(out, 6);
}
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[32], temp_out[32];
@@ -1302,6 +1302,7 @@
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * dest_stride + i]);
}
}
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -94,9 +94,6 @@
prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_16x16 sse2
-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
@@ -212,15 +209,14 @@
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
-
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht8x8
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -1319,8 +1319,7 @@
}
}
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -1832,6 +1831,8 @@
col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
} else {
+ const __m128i zero = _mm_setzero_si128();
+
// 2_D: Calculate the results and store them to destination.
in0 = _mm_add_epi16(stp1_0, stp1_31);
in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1934,50 @@
in30 = _mm_srai_epi16(in30, 6);
in31 = _mm_srai_epi16(in31, 6);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
- _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
- _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
- _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
- _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
- _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
- _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
- _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
- _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
- _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
- _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
- _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
- _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
- _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
- _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
- _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
- _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
- _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
- _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
- _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
- _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
- _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
- _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
- _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
- _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ in_x = _mm_add_epi16(in_x, d0); \
+ in_x = _mm_packus_epi16(in_x, in_x); \
+ _mm_storel_epi64((__m128i *)(dest), in_x); \
+ dest += stride; \
+ }
- output += 8;
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+ RECON_AND_STORE(dest, in16);
+ RECON_AND_STORE(dest, in17);
+ RECON_AND_STORE(dest, in18);
+ RECON_AND_STORE(dest, in19);
+ RECON_AND_STORE(dest, in20);
+ RECON_AND_STORE(dest, in21);
+ RECON_AND_STORE(dest, in22);
+ RECON_AND_STORE(dest, in23);
+ RECON_AND_STORE(dest, in24);
+ RECON_AND_STORE(dest, in25);
+ RECON_AND_STORE(dest, in26);
+ RECON_AND_STORE(dest, in27);
+ RECON_AND_STORE(dest, in28);
+ RECON_AND_STORE(dest, in29);
+ RECON_AND_STORE(dest, in30);
+ RECON_AND_STORE(dest, in31);
+
+ dest += 8 - (stride * 32);
}
}
}
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -109,10 +109,6 @@
add_residual(diff, dest, stride, 16, 16);
}
-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 32, 32);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
@@ -321,20 +317,16 @@
input[0] = 0;
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct10_32x32(input, output, 64);
-
+ vp9_short_idct10_32x32_add_c(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[32] = input[33] = input[34] = 0;
input[64] = input[65] = 0;
input[96] = 0;
- vp9_add_residual_32x32(output, dest, stride);
#endif
} else {
- vp9_short_idct32x32(input, output, 64);
+ vp9_short_idct32x32_add(input, dest, stride);
vpx_memset(input, 0, 2048);
- vp9_add_residual_32x32(output, dest, stride);
}
}
}
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -181,65 +181,6 @@
} while (--i);
}
-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,
- int stride) {
- const int width = 32;
- int i = 16;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
- do {
- d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
- d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
- d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
- d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
- d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
- d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
- // Prediction data.
- p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
- p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
- p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
- p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-
- p0 = _mm_unpacklo_epi8(p1, zero);
- p1 = _mm_unpackhi_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p3, zero);
- p3 = _mm_unpackhi_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p5, zero);
- p5 = _mm_unpackhi_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p7, zero);
- p7 = _mm_unpackhi_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p1 = _mm_packus_epi16(p2, p3);
- p2 = _mm_packus_epi16(p4, p5);
- p3 = _mm_packus_epi16(p6, p7);
-
- _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
- _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
- _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
- _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
- diff += 2 * width;
- dest += 2 * stride;
- } while (--i);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -425,6 +425,7 @@
VP9_COMMON *cm;
MACROBLOCK *x;
struct optimize_ctx *ctx;
+ int *wip_txfrm_size; // for "work in progress" only... will remove once done
};
static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -493,6 +494,7 @@
int ss_txfrm_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
+ int *wip_txfrm_size = args->wip_txfrm_size;
MACROBLOCKD* const xd = &x->e_mbd;
const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -500,6 +502,10 @@
int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
raster_block,
xd->plane[plane].diff);
+ uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+ raster_block,
+ xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride);
TX_TYPE tx_type = DCT_DCT;
xform_quant(plane, block, bsize, ss_txfrm_size, arg);
@@ -509,8 +515,9 @@
switch (ss_txfrm_size / 2) {
case TX_32X32:
- vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw * 2);
+ vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride);
+ *wip_txfrm_size = 32;
break;
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@@ -521,6 +528,7 @@
vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
}
+ *wip_txfrm_size = 16;
break;
case TX_8X8:
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -531,6 +539,7 @@
vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
}
+ *wip_txfrm_size = 8;
break;
case TX_4X4:
tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -544,6 +553,7 @@
vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
diff, bw, tx_type);
}
+ *wip_txfrm_size = 4;
break;
}
}
@@ -551,7 +561,7 @@
void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
- struct encode_b_args arg = {cm, x, NULL};
+ struct encode_b_args arg = {cm, x, NULL, NULL};
foreach_transformed_block_in_plane(xd, bsize, 0,
xform_quant, &arg);
@@ -560,7 +570,7 @@
void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
- struct encode_b_args arg = {cm, x, NULL};
+ struct encode_b_args arg = {cm, x, NULL, NULL};
foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
}
@@ -569,7 +579,8 @@
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ int wip_txfrm_size = 0;
+ struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
vp9_subtract_sby(x, bsize);
if (x->optimize)
@@ -577,8 +588,8 @@
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
-
- vp9_recon_sby(xd, bsize);
+ if (wip_txfrm_size < 32)
+ vp9_recon_sby(xd, bsize);
}
void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -585,7 +596,8 @@
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ int wip_txfrm_size = 0;
+ struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
vp9_subtract_sbuv(x, bsize);
if (x->optimize)
@@ -593,7 +605,8 @@
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- vp9_recon_sbuv(xd, bsize);
+ if (wip_txfrm_size < 32)
+ vp9_recon_sbuv(xd, bsize);
}
void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -600,13 +613,27 @@
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
- struct encode_b_args arg = {cm, x, &ctx};
+ int wip_txfrm_size = 0;
+ struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
vp9_subtract_sb(x, bsize);
if (x->optimize)
vp9_optimize_init(xd, bsize, &ctx);
-
+#if 0
foreach_transformed_block(xd, bsize, encode_block, &arg);
vp9_recon_sb(xd, bsize);
+#else
+ // WIP version: planes are run separately so recon can be skipped after a 32x32 transform; will use foreach_transformed_block when done
+ foreach_transformed_block_in_plane(xd, bsize, 0,
+ encode_block, &arg);
+ if (wip_txfrm_size < 32)
+ vp9_recon_sby(xd, bsize);
+ wip_txfrm_size = 0;
+
+ foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+ if (wip_txfrm_size < 32)
+ vp9_recon_sbuv(xd, bsize);
+#endif
}