shithub: libvpx

Download patch

ref: d52c359d4379f5c7f885c3e346ba366fef49e189
parent: 4c0f283886dd7d9760ddfdbcd8450ebf9fe6c3df
parent: 0084e61d5f935e763c29a49094377d4ab64577e5
author: Jingning Han <[email protected]>
date: Tue Jun 25 09:17:05 EDT 2013

Merge "Tune the rounding operations in 8x8 ADST/DCT sse2"

--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -33,7 +33,13 @@
   vp9_short_idct8x8_add_c(out, dst, stride >> 1);
 }
 void fht8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) {
+  // TODO(jingning): refactor this to test both the _c and _sse2 functions
+  // once all inverse DCT functions have SSE2 implementations.
+#if HAVE_SSE2
+  vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type);
+#else
   vp9_short_fht8x8_c(in, out, stride >> 1, tx_type);
+#endif
 }
 void iht8x8_add(int16_t *in, int16_t *out, uint8_t *dst,
                 int stride, int tx_type) {
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -397,6 +397,24 @@
 
 // write 8x8 array
 static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) {
+  __m128i sign0 = _mm_srai_epi16(res[0], 15);
+  __m128i sign1 = _mm_srai_epi16(res[1], 15);
+  __m128i sign2 = _mm_srai_epi16(res[2], 15);
+  __m128i sign3 = _mm_srai_epi16(res[3], 15);
+  __m128i sign4 = _mm_srai_epi16(res[4], 15);
+  __m128i sign5 = _mm_srai_epi16(res[5], 15);
+  __m128i sign6 = _mm_srai_epi16(res[6], 15);
+  __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+  res[0] = _mm_sub_epi16(res[0], sign0);
+  res[1] = _mm_sub_epi16(res[1], sign1);
+  res[2] = _mm_sub_epi16(res[2], sign2);
+  res[3] = _mm_sub_epi16(res[3], sign3);
+  res[4] = _mm_sub_epi16(res[4], sign4);
+  res[5] = _mm_sub_epi16(res[5], sign5);
+  res[6] = _mm_sub_epi16(res[6], sign6);
+  res[7] = _mm_sub_epi16(res[7], sign7);
+
   res[0] = _mm_srai_epi16(res[0], 1);
   res[1] = _mm_srai_epi16(res[1], 1);
   res[2] = _mm_srai_epi16(res[2], 1);