ref: 904b957ae965bd3d67f15a75cd9db7954f810d33
parent: aa911e8b41733a950dfe96866dbf8118940bc996
author: Johann <[email protected]>
date: Thu Feb 16 12:57:44 EST 2017
consolidate block_error functions vp9_highbd_block_error_8bit_c was a very simple wrapper around vp9_block_error_c. The SSE2 implementation was practically identical to the non-HBD one. It was missing some minor improvements which only went into the original version. In quick speed tests, the AVX implementation showed minimal improvement over SSE2 when it does not detect overflow. However, when overflow is detected the function is run a second time. The OperationCheck test seems to trigger this case and reverses any speed benefits by running ~60% slower. AVX2 on the other hand is always 30-40% faster. Change-Id: I9fcb9afbcb560f234c7ae1b13ddb69eca3988ba1
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -234,11 +234,11 @@
typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, int block_size);
-typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestParam;
+typedef std::tr1::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;
-class BlockErrorTest
+class BlockErrorTestFP
: public ::testing::Test,
- public ::testing::WithParamInterface<BlockErrorTestParam> {
+ public ::testing::WithParamInterface<BlockErrorTestFPParam> {
protected:
virtual void SetUp() {
txfm_size_ = GET_PARAM(0);
@@ -367,7 +367,7 @@
Check(expected);
}
-TEST_P(BlockErrorTest, MinValue) {
+TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
FillConstant(kMin, 0);
@@ -374,7 +374,7 @@
Check(expected);
}
-TEST_P(BlockErrorTest, MaxValue) {
+TEST_P(BlockErrorTestFP, MaxValue) {
const int64_t kMax = 32640;
const int64_t expected = kMax * kMax * txfm_size_;
FillConstant(kMax, 0);
@@ -381,7 +381,7 @@
Check(expected);
}
-TEST_P(BlockErrorTest, Random) {
+TEST_P(BlockErrorTestFP, Random) {
int64_t expected;
switch (txfm_size_) {
case 16: expected = 2051681432; break;
@@ -410,7 +410,7 @@
make_tuple(1024, &vpx_satd_c)));
INSTANTIATE_TEST_CASE_P(
- C, BlockErrorTest,
+ C, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_c),
make_tuple(64, &vp9_block_error_fp_c),
make_tuple(256, &vp9_block_error_fp_c),
@@ -447,7 +447,7 @@
make_tuple(1024, &vpx_satd_sse2)));
INSTANTIATE_TEST_CASE_P(
- SSE2, BlockErrorTest,
+ SSE2, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
make_tuple(64, &vp9_block_error_fp_sse2),
make_tuple(256, &vp9_block_error_fp_sse2),
@@ -488,7 +488,7 @@
// in place.
#if !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
- NEON, BlockErrorTest,
+ NEON, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
make_tuple(64, &vp9_block_error_fp_neon),
make_tuple(256, &vp9_block_error_fp_neon),
--- a/test/test.mk
+++ b/test/test.mk
@@ -157,7 +157,7 @@
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
--- /dev/null
+++ b/test/vp9_block_error_test.cc
@@ -1,0 +1,198 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 1000;
+
+typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps);
+
+typedef std::tr1::tuple<HBDBlockErrorFunc, HBDBlockErrorFunc, vpx_bit_depth_t>
+ BlockErrorParam;
+
+typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz);
+
+template <BlockErrorFunc fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bps) {
+ EXPECT_EQ(bps, 8);
+ return fn(coeff, dqcoeff, block_size, ssz);
+}
+
+class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> {
+ public:
+ virtual ~BlockErrorTest() {}
+ virtual void SetUp() {
+ error_block_op_ = GET_PARAM(0);
+ ref_error_block_op_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ vpx_bit_depth_t bit_depth_;
+ HBDBlockErrorFunc error_block_op_;
+ HBDBlockErrorFunc ref_error_block_op_;
+};
+
+TEST_P(BlockErrorTest, OperationCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
+ for (int j = 0; j < block_size; j++) {
+ // coeff and dqcoeff will always have at least the same sign, and this
+ // can be used for optimization, so generate test input precisely.
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << msb);
+ dqcoeff[j] = rnd(1 << msb);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << msb);
+ dqcoeff[j] = -rnd(1 << msb);
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(BlockErrorTest, ExtremeValues) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
+ int max_val = ((1 << msb) - 1);
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ int k = (i / 9) % 9;
+
+ // Change the maximum coeff value, to test different bit boundaries
+ if (k == 8 && (i % 9) == 0) {
+ max_val >>= 1;
+ }
+ block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
+ for (int j = 0; j < block_size; j++) {
+ if (k < 4) {
+ // Test at positive maximum values
+ coeff[j] = k % 2 ? max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+ } else if (k < 8) {
+ // Test at negative maximum values
+ coeff[j] = k % 2 ? -max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
+ } else {
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << 14);
+ dqcoeff[j] = rnd(1 << 14);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << 14);
+ dqcoeff[j] = -rnd(1 << 14);
+ }
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const BlockErrorParam sse2_block_error_tests[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+ VPX_BITS_10),
+ make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+ VPX_BITS_12),
+ make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
+ VPX_BITS_8),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ make_tuple(&BlockError8BitWrapper<vp9_block_error_sse2>,
+ &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, BlockErrorTest,
+ ::testing::ValuesIn(sse2_block_error_tests));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlockErrorTest,
+ ::testing::Values(make_tuple(&BlockError8BitWrapper<vp9_block_error_avx2>,
+ &BlockError8BitWrapper<vp9_block_error_c>,
+ VPX_BITS_8)));
+#endif // HAVE_AVX2
+} // namespace
--- a/test/vp9_error_block_test.cc
+++ /dev/null
@@ -1,199 +1,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cmath>
-#include <cstdlib>
-#include <string>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vpx/vpx_codec.h"
-#include "vpx/vpx_integer.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-#if CONFIG_VP9_HIGHBITDEPTH
-const int kNumIterations = 1000;
-
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz, int bps);
-
-typedef std::tr1::tuple<ErrorBlockFunc, ErrorBlockFunc, vpx_bit_depth_t>
- ErrorBlockParam;
-
-// wrapper for 8-bit block error functions without a 'bps' param.
-typedef int64_t (*HighBdBlockError8bit)(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz);
-template <HighBdBlockError8bit fn>
-int64_t HighBdBlockError8bitWrapper(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz,
- int bps) {
- EXPECT_EQ(8, bps);
- return fn(coeff, dqcoeff, block_size, ssz);
-}
-
-class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
- public:
- virtual ~ErrorBlockTest() {}
- virtual void SetUp() {
- error_block_op_ = GET_PARAM(0);
- ref_error_block_op_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- }
-
- virtual void TearDown() { libvpx_test::ClearSystemState(); }
-
- protected:
- vpx_bit_depth_t bit_depth_;
- ErrorBlockFunc error_block_op_;
- ErrorBlockFunc ref_error_block_op_;
-};
-
-TEST_P(ErrorBlockTest, OperationCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
- int err_count_total = 0;
- int first_failure = -1;
- intptr_t block_size;
- int64_t ssz;
- int64_t ret;
- int64_t ref_ssz;
- int64_t ref_ret;
- const int msb = bit_depth_ + 8 - 1;
- for (int i = 0; i < kNumIterations; ++i) {
- int err_count = 0;
- block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
- for (int j = 0; j < block_size; j++) {
- // coeff and dqcoeff will always have at least the same sign, and this
- // can be used for optimization, so generate test input precisely.
- if (rnd(2)) {
- // Positive number
- coeff[j] = rnd(1 << msb);
- dqcoeff[j] = rnd(1 << msb);
- } else {
- // Negative number
- coeff[j] = -rnd(1 << msb);
- dqcoeff[j] = -rnd(1 << msb);
- }
- }
- ref_ret =
- ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
- ASM_REGISTER_STATE_CHECK(
- ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
- err_count += (ref_ret != ret) | (ref_ssz != ssz);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Error Block Test, C output doesn't match optimized output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(ErrorBlockTest, ExtremeValues) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
- int err_count_total = 0;
- int first_failure = -1;
- intptr_t block_size;
- int64_t ssz;
- int64_t ret;
- int64_t ref_ssz;
- int64_t ref_ret;
- const int msb = bit_depth_ + 8 - 1;
- int max_val = ((1 << msb) - 1);
- for (int i = 0; i < kNumIterations; ++i) {
- int err_count = 0;
- int k = (i / 9) % 9;
-
- // Change the maximum coeff value, to test different bit boundaries
- if (k == 8 && (i % 9) == 0) {
- max_val >>= 1;
- }
- block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
- for (int j = 0; j < block_size; j++) {
- if (k < 4) {
- // Test at positive maximum values
- coeff[j] = k % 2 ? max_val : 0;
- dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
- } else if (k < 8) {
- // Test at negative maximum values
- coeff[j] = k % 2 ? -max_val : 0;
- dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
- } else {
- if (rnd(2)) {
- // Positive number
- coeff[j] = rnd(1 << 14);
- dqcoeff[j] = rnd(1 << 14);
- } else {
- // Negative number
- coeff[j] = -rnd(1 << 14);
- dqcoeff[j] = -rnd(1 << 14);
- }
- }
- }
- ref_ret =
- ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
- ASM_REGISTER_STATE_CHECK(
- ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
- err_count += (ref_ret != ret) | (ref_ssz != ssz);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Error Block Test, C output doesn't match optimized output. "
- << "First failed at test case " << first_failure;
-}
-
-using std::tr1::make_tuple;
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
- SSE2, ErrorBlockTest,
- ::testing::Values(
- make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
- VPX_BITS_10),
- make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
- VPX_BITS_12),
- make_tuple(&vp9_highbd_block_error_sse2, &vp9_highbd_block_error_c,
- VPX_BITS_8),
- make_tuple(
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_sse2>,
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
- VPX_BITS_8)));
-#endif // HAVE_SSE2
-
-#if HAVE_AVX
-INSTANTIATE_TEST_CASE_P(
- AVX, ErrorBlockTest,
- ::testing::Values(make_tuple(
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_avx>,
- &HighBdBlockError8bitWrapper<vp9_highbd_block_error_8bit_c>,
- VPX_BITS_8)));
-#endif // HAVE_AVX
-
-#endif // CONFIG_VP9_HIGHBITDEPTH
-} // namespace
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -130,9 +130,6 @@
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error sse2/;
- add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp sse2/;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -284,22 +284,12 @@
return error;
}
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz) {
- // Note that the C versions of these 2 functions (vp9_block_error and
- // vp9_highbd_block_error_8bit are the same, but the optimized assembly
- // routines are not compatible in the non high bitdepth configuration, so
- // they still cannot share the same name.
- return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bd) {
if (bd == 8) {
- return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+ return vp9_block_error(coeff, dqcoeff, block_size, ssz);
} else {
return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
}
@@ -1130,16 +1120,9 @@
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
- distortion +=
- vp9_highbd_block_error_8bit(
- coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
- 2;
-#else
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &unused) >>
2;
-#endif
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +1,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-; intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
- vzeroupper
-
- ; If only one iteration is required, then handle this as a special case.
- ; It is the most frequent case, so we can have a significant gain here
- ; by not setting up a loop and accumulators.
- cmp sizeq, 16
- jne .generic
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Common case of size == 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- ; Load input vectors
- mova xm0, [dqcq]
- packssdw xm0, [dqcq+16]
- mova xm2, [uqcq]
- packssdw xm2, [uqcq+16]
-
- mova xm1, [dqcq+32]
- packssdw xm1, [dqcq+48]
- mova xm3, [uqcq+32]
- packssdw xm3, [uqcq+48]
-
- ; Compute the errors.
- psubw xm0, xm2
- psubw xm1, xm3
-
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- pmaddwd xm2, xm2
- pmaddwd xm3, xm3
-
- pmaddwd xm0, xm0
- pmaddwd xm1, xm1
-
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32bits
- paddd xm2, xm3
- paddd xm0, xm1
-
- ; Accumulate horizontally in 64 bits, there is no chance of overflow here
- pxor xm5, xm5
-
- pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
-
- pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
-
- paddq xm2, xm3
- paddq xm0, xm1
-
- psrldq xm3, xm2, 8
- psrldq xm1, xm0, 8
-
- paddq xm2, xm3
- paddq xm0, xm1
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm0
- movq [sszq], xm2
-%else
- movd eax, xm0
- pextrd edx, xm0, 1
- movq [sszd], xm2
-%endif
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, speculative low precision
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ALIGN 16
-.generic:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; overflow detection register for xm4
- pxor xm6, xm6 ; ssz accumulator
- pxor xm7, xm7 ; overflow detection register for xm6
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
-
- ; Push the negative size as the high precision code might need it
- push sizeq
-
-.loop:
- ; Load input vectors
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
-
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
-
- add sizeq, 16
-
- ; Compute the squared errors.
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
-
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
-
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32bits
- paddd xm2, xm3
- paddd xm0, xm1
-
- ; We accumulate using 32 bit arithmetic, but detect potential overflow
- ; by checking if the MSB of the accumulators have ever been a set bit.
- ; If yes, we redo the whole compute at the end on higher precision, but
- ; this happens extremely rarely, so we still achieve a net gain.
- paddd xm4, xm0
- paddd xm6, xm2
- por xm5, xm4 ; OR in the accumulator for overflow detection
- por xm7, xm6 ; OR in the accumulator for overflow detection
-
- jnz .loop
-
- ; Add pairs horizontally (still only on 32 bits)
- phaddd xm4, xm4
- por xm5, xm4 ; OR in the accumulator for overflow detection
- phaddd xm6, xm6
- por xm7, xm6 ; OR in the accumulator for overflow detection
-
- ; Check for possibility of overflow by testing if bit 32 of each dword lane
- ; have ever been set. If they were not, then there was no overflow and the
- ; final sum will fit in 32 bits. If overflow happened, then
- ; we redo the whole computation on higher precision.
- por xm7, xm5
- pmovmskb r4, xm7
- test r4, 0x8888
- jnz .highprec
-
- phaddd xm4, xm4
- phaddd xm6, xm6
- pmovzxdq xm4, xm4
- pmovzxdq xm6, xm6
-
- ; Restore stack
- pop sizeq
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
-%else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
-%endif
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, high precision case
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; dedicated zero register
- pxor xm6, xm6 ; ssz accumulator
- pop sizeq
-
-.loophp:
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
-
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
-
- add sizeq, 16
-
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
-
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
-
- ; accumulate in 64bit
- punpckldq xm7, xm0, xm5
- punpckhdq xm0, xm5
- paddq xm4, xm7
-
- punpckldq xm7, xm2, xm5
- punpckhdq xm2, xm5
- paddq xm6, xm7
-
- punpckldq xm7, xm1, xm5
- punpckhdq xm1, xm5
- paddq xm4, xm7
-
- punpckldq xm7, xm3, xm5
- punpckhdq xm3, xm5
- paddq xm6, xm7
-
- paddq xm4, xm0
- paddq xm4, xm1
- paddq xm6, xm2
- paddq xm6, xm3
-
- jnz .loophp
-
- ; Accumulate horizontally
- movhlps xm5, xm4
- movhlps xm7, xm6
- paddq xm4, xm5
- paddq xm6, xm7
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
-%else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
-%endif
- RET
-
-END
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +1,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-; intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
- pxor m4, m4 ; sse accumulator
- pxor m6, m6 ; ssz accumulator
- pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
-
- ALIGN 16
-
-.loop:
- mova m0, [dqcq+sizeq*4]
- packssdw m0, [dqcq+sizeq*4+mmsize]
- mova m2, [uqcq+sizeq*4]
- packssdw m2, [uqcq+sizeq*4+mmsize]
-
- mova m1, [dqcq+sizeq*4+mmsize*2]
- packssdw m1, [dqcq+sizeq*4+mmsize*3]
- mova m3, [uqcq+sizeq*4+mmsize*2]
- packssdw m3, [uqcq+sizeq*4+mmsize*3]
-
- add sizeq, mmsize
-
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
- psubw m0, m2
- pmaddwd m2, m2
- pmaddwd m0, m0
-
- psubw m1, m3
- pmaddwd m3, m3
- pmaddwd m1, m1
-
- ; accumulate in 64bit
- punpckldq m7, m0, m5
- punpckhdq m0, m5
- paddq m4, m7
-
- punpckldq m7, m2, m5
- punpckhdq m2, m5
- paddq m6, m7
-
- punpckldq m7, m1, m5
- punpckhdq m1, m5
- paddq m4, m7
-
- punpckldq m7, m3, m5
- punpckhdq m3, m5
- paddq m6, m7
-
- paddq m4, m0
- paddq m4, m1
- paddq m6, m2
- paddq m6, m3
-
- jnz .loop
-
- ; accumulate horizontally and store in return value
- movhlps m5, m4
- movhlps m7, m6
- paddq m4, m5
- paddq m6, m7
-
-%if ARCH_X86_64
- movq rax, m4
- movq [sszq], m6
-%else
- mov eax, sszm
- pshufd m5, m4, 0x1
- movq [eax], m6
- movd eax, m4
- movd edx, m5
-%endif
- RET
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -108,10 +108,6 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif
ifeq ($(ARCH_X86_64),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm