ref: 5fbc7a286b4d72883392fdbb10ec52bace662f66
parent: 356174583506fb6654a3de7264348fbbfb7ca62c
author: Johann <[email protected]>
date: Fri Nov 30 10:42:57 EST 2018
quantize 32x32: saturate dqcoeff on x86. This slows down low bitdepth builds but is necessary to obtain correct values. BUG=webm:1448 Change-Id: I4ca9145f576089bb8496fcfeedeb556dc8fe6574
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -496,7 +496,6 @@
#endif // HAVE_SSE2
#if HAVE_SSSE3
-#if CONFIG_VP9_HIGHBITDEPTH
#if ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
SSSE3, VP9QuantizeTest,
@@ -521,35 +520,9 @@
false)));
#endif // ARCH_X86_64
-#else
-#if ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(
- SSSE3, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
- &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
- 16, true),
- make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
- &QuantFPWrapper<quantize_fp_32x32_nz_c>,
- VPX_BITS_8, 32, true)));
-
-#else
-INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
- &vpx_quantize_b_c,
- VPX_BITS_8, 16, false)));
-#endif // ARCH_X86_64
-// TODO(webm:1448): lowbd truncates results in C.
-INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
- ::testing::Values(make_tuple(
- &vpx_quantize_b_32x32_ssse3,
- &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSSE3
#if HAVE_AVX
-#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_avx,
&vpx_quantize_b_c,
@@ -557,17 +530,6 @@
make_tuple(&vpx_quantize_b_32x32_avx,
&vpx_quantize_b_32x32_c,
VPX_BITS_8, 32, false)));
-#else
-INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_avx,
- &vpx_quantize_b_c,
- VPX_BITS_8, 16, false)));
-// TODO(webm:1448): lowbd truncates results in C.
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_32x32_avx,
- &vpx_quantize_b_32x32_c,
- VPX_BITS_8, 32, false)));
-#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_AVX
#if ARCH_X86_64 && HAVE_AVX2
@@ -576,7 +538,7 @@
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
16, true)));
-#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_AVX2
// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,6 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
@@ -259,7 +260,15 @@
15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+ // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than
+ // truncating with a cast, saturate the value. This is easier to implement
+ // on x86 and preserves the sign of the value.
+ dqcoeff_ptr[rc] =
+ clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif // (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
if (tmp) eob = idx_arr[i];
}
--- a/vpx_dsp/x86/quantize_ssse3.h
+++ b/vpx_dsp/x86/quantize_ssse3.h
@@ -24,7 +24,6 @@
// Un-sign to bias rounding like C.
const __m128i coeff = _mm_abs_epi16(qcoeff);
-#if CONFIG_VP9_HIGHBITDEPTH
const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
@@ -40,17 +39,12 @@
dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+#if CONFIG_VP9_HIGHBITDEPTH
_mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
_mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
#else
- __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant);
- (void)zero;
-
- dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1);
-
- dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff);
-
- _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+ _mm_store_si128((__m128i *)(dqcoeff),
+ _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
#endif // CONFIG_VP9_HIGHBITDEPTH
}