shithub: libvpx

Download patch

ref: e3f12b520fe2d963d4aecf18c30ebbb594d50899
parent: dff4d376eaff3460e930711807ce865ff76aff3a
author: Johann <[email protected]>
date: Mon Apr 29 09:05:30 EDT 2019

vp8 quantize: use native abs/sign implementations

~4% improvement with a very rudimentary speed test

Change-Id: Iad8868327e3276dbead783a79849295b0e4b135c

--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -13,9 +13,10 @@
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "./vpx_config.h"
 #include "./vp8_rtcd.h"
+#include "./vpx_config.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -117,7 +118,8 @@
 };
 
 class QuantizeTest : public QuantizeTestBase,
-                     public ::testing::TestWithParam<VP8QuantizeParam> {
+                     public ::testing::TestWithParam<VP8QuantizeParam>,
+                     public AbstractBench {
  protected:
   virtual void SetUp() {
     SetupCompressor();
@@ -125,6 +127,10 @@
     c_quant_ = GET_PARAM(1);
   }
 
+  virtual void Run() {
+    asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);
+  }
+
   void RunComparison() {
     for (int i = 0; i < kNumBlocks; ++i) {
       ASM_REGISTER_STATE_CHECK(
@@ -165,6 +171,13 @@
     FillCoeffRandom();
     RunComparison();
   }
+}
+
+TEST_P(QuantizeTest, DISABLED_Speed) {
+  FillCoeffRandom();
+
+  RunNTimes(10000000);
+  PrintMedian("vp8 quantize");
 }
 
 #if HAVE_SSE2
--- a/vp8/encoder/x86/quantize_sse4.c
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -11,8 +11,8 @@
 #include <smmintrin.h> /* SSE4.1 */
 
 #include "./vp8_rtcd.h"
-#include "vp8/encoder/block.h"
 #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+#include "vp8/encoder/block.h"
 
 #define SELECT_EOB(i, z, x, y, q)         \
   do {                                    \
@@ -31,8 +31,7 @@
   char eob = 0;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
 
-  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0,
-      dqcoeff1;
+  __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
@@ -53,16 +52,10 @@
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
   zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
 
-  /* Sign of z: z >> 15 */
-  sz0 = _mm_srai_epi16(z0, 15);
-  sz1 = _mm_srai_epi16(z1, 15);
+  /* x = abs(z) */
+  x0 = _mm_abs_epi16(z0);
+  x1 = _mm_abs_epi16(z1);
 
-  /* x = abs(z): (z ^ sz) - sz */
-  x0 = _mm_xor_si128(z0, sz0);
-  x1 = _mm_xor_si128(z1, sz1);
-  x0 = _mm_sub_epi16(x0, sz0);
-  x1 = _mm_sub_epi16(x1, sz1);
-
   /* zbin[] + zbin_extra */
   zbin0 = _mm_add_epi16(zbin0, zbin_extra);
   zbin1 = _mm_add_epi16(zbin1, zbin_extra);
@@ -89,11 +82,9 @@
   y0 = _mm_mulhi_epi16(y0, quant_shift0);
   y1 = _mm_mulhi_epi16(y1, quant_shift1);
 
-  /* Return the sign: (y ^ sz) - sz */
-  y0 = _mm_xor_si128(y0, sz0);
-  y1 = _mm_xor_si128(y1, sz1);
-  y0 = _mm_sub_epi16(y0, sz0);
-  y1 = _mm_sub_epi16(y1, sz1);
+  /* Restore the sign. */
+  y0 = _mm_sign_epi16(y0, z0);
+  y1 = _mm_sign_epi16(y1, z1);
 
   /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
   SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);