ref: 969957f9f2a124861145a0d18781b855e98caa54
parent: 9efc42f4f89eeb05aba384e9179281ece3be6429
author: Jingning Han <[email protected]>
date: Thu Jan 26 10:00:04 EST 2017
Fix real-time compression regression in hbd mode. This commit resolves the compression performance regression in the real-time encoding setting when high bit-depth mode is enabled. The current solution temporarily disables the SIMD implementations of vpx_satd, hadamard8x8, and hadamard16x16 in high bit-depth mode. The commit makes the coding results bit-wise identical between the regular coding pipeline and the high bit-depth pipeline at profile 0. BUG=webm:1365 Change-Id: Icfb900821733749685370460a1a5a7e07f76f4bf
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -315,11 +315,13 @@
::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
+#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(C, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_c),
make_tuple(64, &vpx_satd_c),
make_tuple(256, &vpx_satd_c),
make_tuple(1024, &vpx_satd_c)));
+#endif
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
@@ -345,6 +347,7 @@
make_tuple(64, &vpx_int_pro_col_sse2,
&vpx_int_pro_col_c)));
+#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_sse2),
make_tuple(64, &vpx_satd_sse2),
@@ -351,6 +354,7 @@
make_tuple(256, &vpx_satd_sse2),
make_tuple(1024, &vpx_satd_sse2)));
#endif
+#endif
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
@@ -376,12 +380,14 @@
make_tuple(64, &vpx_int_pro_col_neon,
&vpx_int_pro_col_c)));
+#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_neon),
make_tuple(64, &vpx_satd_neon),
make_tuple(256, &vpx_satd_neon),
make_tuple(1024, &vpx_satd_neon)));
-#endif
+#endif // !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(
@@ -407,11 +413,13 @@
make_tuple(64, &vpx_int_pro_col_msa,
&vpx_int_pro_col_c)));
+#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
::testing::Values(make_tuple(16, &vpx_satd_msa),
make_tuple(64, &vpx_satd_msa),
make_tuple(256, &vpx_satd_msa),
make_tuple(1024, &vpx_satd_msa)));
-#endif
+#endif // !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_MSA
} // namespace
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -144,6 +144,7 @@
}
}
+#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_c));
@@ -166,6 +167,7 @@
INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_msa));
#endif // HAVE_MSA
+#endif // !CONFIG_VP9_HIGHBITDEPTH
class Hadamard16x16Test : public HadamardTestBase {};
@@ -210,6 +212,7 @@
}
}
+#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_c));
@@ -227,4 +230,5 @@
INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_msa));
#endif // HAVE_MSA
+#endif // !CONFIG_VP9_HIGHBITDEPTH
} // namespace
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -132,6 +132,9 @@
add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
+ add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
+ specialize qw/vp9_block_error_fp/;
+
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1815,7 +1815,9 @@
}
#if CONFIG_VP9_HIGHBITDEPTH
- {
+ // TODO(jingning): Implement integral projection functions for high bit-depth
+ // setting and remove this part of code.
+ if (xd->bd != 8) {
unsigned int this_sad;
tmp_mv->row = 0;
tmp_mv->col = 0;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -590,25 +590,10 @@
*out_dist_sum += dist << 4;
}
-#if CONFIG_VP9_HIGHBITDEPTH
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
int *skippable, int64_t *sse, BLOCK_SIZE bsize,
TX_SIZE tx_size) {
MACROBLOCKD *xd = &x->e_mbd;
- unsigned int var_y, sse_y;
-
- (void)tx_size;
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
- &sse_y);
- *sse = INT_MAX;
- *skippable = 0;
- return;
-}
-#else
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
- int *skippable, int64_t *sse, BLOCK_SIZE bsize,
- TX_SIZE tx_size) {
- MACROBLOCKD *xd = &x->e_mbd;
const struct macroblockd_plane *pd = &xd->plane[0];
struct macroblock_plane *const p = &x->plane[0];
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -624,6 +609,20 @@
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
+#if CONFIG_VP9_HIGHBITDEPTH
+ // TODO(jingning): Implement the high bit-depth Hadamard transforms and
+ // remove this check condition.
+ if (xd->bd != 8) {
+ unsigned int var_y, sse_y;
+ (void)tx_size;
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
+ &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;
+ }
+#endif
+
(void)cpi;
// The max tx_size passed in is TX_16X16.
@@ -648,7 +647,7 @@
switch (tx_size) {
case TX_16X16:
- vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+ vpx_hadamard_16x16(src_diff, diff_stride, coeff);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
@@ -655,7 +654,7 @@
scan_order->iscan);
break;
case TX_8X8:
- vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+ vpx_hadamard_8x8(src_diff, diff_stride, coeff);
vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
@@ -699,7 +698,7 @@
if (*eob == 1)
this_rdc->rate += (int)abs(qcoeff[0]);
else if (*eob > 1)
- this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
+ this_rdc->rate += vpx_satd(qcoeff, step << 4);
this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
}
@@ -711,7 +710,6 @@
this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
}
-#endif
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -321,7 +321,7 @@
return error;
}
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
int block_size) {
int i;
int64_t error = 0;
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -67,9 +67,10 @@
// The order of the output coeff of the hadamard is not important. For
// optimization purposes the final transpose may be skipped.
void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int idx;
int16_t buffer[64];
+ int16_t buffer2[64];
int16_t *tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
@@ -80,17 +81,19 @@
tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
- hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit
- // dynamic range [-2040, 2040]
- coeff += 8; // coeff: 15 bit
- // dynamic range [-16320, 16320]
+ hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ // buffer2: 15 bit
+ // dynamic range [-16320, 16320]
++tmp_buf;
}
+
+ for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
}
// In place 16x16 2D Hadamard transform
void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
@@ -101,15 +104,15 @@
// coeff: 15 bit, dynamic range [-16320, 16320]
for (idx = 0; idx < 64; ++idx) {
- int16_t a0 = coeff[0];
- int16_t a1 = coeff[64];
- int16_t a2 = coeff[128];
- int16_t a3 = coeff[192];
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[64];
+ tran_low_t a2 = coeff[128];
+ tran_low_t a3 = coeff[192];
- int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
- int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
- int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
- int16_t b3 = (a2 - a3) >> 1;
+ tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ tran_low_t b3 = (a2 - a3) >> 1;
coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
coeff[64] = b1 + b3;
@@ -122,7 +125,7 @@
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
-int vpx_satd_c(const int16_t *coeff, int length) {
+int vpx_satd_c(const tran_low_t *coeff, int length) {
int i;
int satd = 0;
for (i = 0; i < length; ++i) satd += abs(coeff[i]);
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -885,14 +885,26 @@
add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vpx_minmax_8x8 sse2 neon msa/;
- add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
- add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_8x8/;
- add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon msa/;
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_16x16/;
+
+ add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/vpx_satd/;
+ } else {
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
+
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
+
+ add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+ specialize qw/vpx_satd sse2 neon msa/;
+ }
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
specialize qw/vpx_int_pro_row sse2 neon msa/;