ref: 7eee487c0036340e99425cc1cf1503e21e70678a
parent: 08d86bc9043f55d86f20f4bab74bc4ca949b3a4c
author: Peter de Rivaz <[email protected]>
date: Thu Oct 16 09:41:55 EDT 2014
Added highbitdepth sse2 SAD acceleration and tests. Change-Id: I1a74a1b032b198793ef9cc526327987f7799125f (cherry picked from commit b1a6f6b9cb47eafe0ce86eaf0318612806091fe5)
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -27,6 +27,7 @@
#include "test/register_state_check.h"
#include "test/util.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_codec.h"
#if CONFIG_VP8_ENCODER
@@ -35,7 +36,7 @@
const unsigned char *reference_ptr,
int reference_stride,
unsigned int max_sad);
-typedef std::tr1::tuple<int, int, SadMxNFunc> SadMxNParam;
+typedef std::tr1::tuple<int, int, SadMxNFunc, int> SadMxNParam;
#endif
#if CONFIG_VP9_ENCODER
typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr,
@@ -42,15 +43,21 @@
int source_stride,
const unsigned char *reference_ptr,
int reference_stride);
-typedef std::tr1::tuple<int, int, SadMxNVp9Func> SadMxNVp9Param;
+typedef std::tr1::tuple<int, int, SadMxNVp9Func, int> SadMxNVp9Param;
+typedef uint32_t (*SadMxNAvgVp9Func)(const uint8_t *source_ptr,
+ int source_stride,
+ const uint8_t *reference_ptr,
+ int reference_stride,
+ const uint8_t *second_pred);
+typedef std::tr1::tuple<int, int, SadMxNAvgVp9Func, int> SadMxNAvgVp9Param;
#endif
typedef void (*SadMxNx4Func)(const uint8_t *src_ptr,
int src_stride,
- const unsigned char *const ref_ptr[],
+ const uint8_t *const ref_ptr[],
int ref_stride,
- unsigned int *sad_array);
-typedef std::tr1::tuple<int, int, SadMxNx4Func> SadMxNx4Param;
+ uint32_t *sad_array);
+typedef std::tr1::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
using libvpx_test::ACMRandom;
@@ -57,20 +64,55 @@
namespace {
class SADTestBase : public ::testing::Test {
public:
- SADTestBase(int width, int height) : width_(width), height_(height) {}
+ SADTestBase(int width, int height, int bit_depth) :
+ width_(width), height_(height), bd_(bit_depth) {}
static void SetUpTestCase() {
+#if CONFIG_VP9_HIGHBITDEPTH
+ source_data8_ = reinterpret_cast<uint8_t*>(
+ vpx_memalign(kDataAlignment, kDataBlockSize));
+ reference_data8_ = reinterpret_cast<uint8_t*>(
+ vpx_memalign(kDataAlignment, kDataBufferSize));
+ second_pred8_ = reinterpret_cast<uint8_t*>(
+ vpx_memalign(kDataAlignment, 64*64));
+ source_data16_ = reinterpret_cast<uint16_t*>(
+ vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
+ reference_data16_ = reinterpret_cast<uint16_t*>(
+ vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
+ second_pred16_ = reinterpret_cast<uint16_t*>(
+ vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+#else
source_data_ = reinterpret_cast<uint8_t*>(
vpx_memalign(kDataAlignment, kDataBlockSize));
reference_data_ = reinterpret_cast<uint8_t*>(
vpx_memalign(kDataAlignment, kDataBufferSize));
+ second_pred_ = reinterpret_cast<uint8_t*>(
+ vpx_memalign(kDataAlignment, 64*64));
+#endif
}
static void TearDownTestCase() {
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_free(source_data8_);
+ source_data8_ = NULL;
+ vpx_free(reference_data8_);
+ reference_data8_ = NULL;
+ vpx_free(second_pred8_);
+ second_pred8_ = NULL;
+ vpx_free(source_data16_);
+ source_data16_ = NULL;
+ vpx_free(reference_data16_);
+ reference_data16_ = NULL;
+ vpx_free(second_pred16_);
+ second_pred16_ = NULL;
+#else
vpx_free(source_data_);
source_data_ = NULL;
vpx_free(reference_data_);
reference_data_ = NULL;
+ vpx_free(second_pred_);
+ second_pred_ = NULL;
+#endif
}
virtual void TearDown() {
@@ -84,13 +126,38 @@
static const int kDataBufferSize = 4 * kDataBlockSize;
virtual void SetUp() {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (bd_ == -1) {
+ use_high_bit_depth_ = false;
+ bit_depth_ = VPX_BITS_8;
+ source_data_ = source_data8_;
+ reference_data_ = reference_data8_;
+ second_pred_ = second_pred8_;
+ } else {
+ use_high_bit_depth_ = true;
+ bit_depth_ = static_cast<vpx_bit_depth_t>(bd_);
+ source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+ reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+ second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+ }
+#endif
+ mask_ = (1 << bit_depth_) - 1;
source_stride_ = (width_ + 31) & ~31;
reference_stride_ = width_ * 2;
rnd_.Reset(ACMRandom::DeterministicSeed());
}
- virtual uint8_t* GetReference(int block_idx) {
+ virtual uint8_t *GetReference(int block_idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ return reference_data_ + block_idx * kDataBlockSize;
+ } else {
+ return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+ block_idx * kDataBlockSize);
+ }
+#else
return reference_data_ + block_idx * kDataBlockSize;
+#endif
}
// Sum of Absolute Differences. Given two blocks, calculate the absolute
@@ -97,12 +164,79 @@
// difference between two pixels in the same relative location; accumulate.
unsigned int ReferenceSAD(unsigned int max_sad, int block_idx) {
unsigned int sad = 0;
- const uint8_t* const reference = GetReference(block_idx);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const source8 = source_data_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+#else
+ const uint8_t *const reference = GetReference(block_idx);
+ const uint8_t *const source = source_data_;
+#endif
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ sad +=
+ abs(source8[h * source_stride_ + w] -
+ reference8[h * reference_stride_ + w]);
+ } else {
+ sad +=
+ abs(source16[h * source_stride_ + w] -
+ reference16[h * reference_stride_ + w]);
+ }
+#else
+ sad +=
+ abs(source[h * source_stride_ + w] -
+ reference[h * reference_stride_ + w]);
+#endif
+ }
+ if (sad > max_sad) {
+ break;
+ }
+ }
+ return sad;
+ }
+ // Sum of Absolute Differences Average. Given two blocks and a prediction,
+ // calculate the absolute difference between each source pixel and the
+ // average of the matching reference and predicted pixels; accumulate.
+ unsigned int ReferenceSADavg(unsigned int max_sad, int block_idx) {
+ unsigned int sad = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const source8 = source_data_;
+ const uint8_t *const second_pred8 = second_pred_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+ const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+#else
+ const uint8_t *const reference = GetReference(block_idx);
+ const uint8_t *const source = source_data_;
+ const uint8_t *const second_pred = second_pred_;
+#endif
for (int h = 0; h < height_; ++h) {
for (int w = 0; w < width_; ++w) {
- sad += abs(source_data_[h * source_stride_ + w]
- - reference[h * reference_stride_ + w]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ const int tmp = second_pred8[h * width_ + w] +
+ reference8[h * reference_stride_ + w];
+ const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+ sad += abs(source8[h * source_stride_ + w] - comp_pred);
+ } else {
+ const int tmp = second_pred16[h * width_ + w] +
+ reference16[h * reference_stride_ + w];
+ const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+ sad += abs(source16[h * source_stride_ + w] - comp_pred);
+ }
+#else
+ const int tmp = second_pred[h * width_ + w] +
+ reference[h * reference_stride_ + w];
+ const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+ sad += abs(source[h * source_stride_ + w] - comp_pred);
+#endif
}
if (sad > max_sad) {
break;
@@ -111,26 +245,61 @@
return sad;
}
- void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+ void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif
for (int h = 0; h < height_; ++h) {
for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = fill_constant;
+ } else {
+ data16[h * stride + w] = fill_constant;
+ }
+#else
data[h * stride + w] = fill_constant;
+#endif
}
}
}
void FillRandom(uint8_t *data, int stride) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+#endif
for (int h = 0; h < height_; ++h) {
for (int w = 0; w < width_; ++w) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = rnd_.Rand8();
+ } else {
+ data16[h * stride + w] = rnd_.Rand16() & mask_;
+ }
+#else
data[h * stride + w] = rnd_.Rand8();
+#endif
}
}
}
- int width_, height_;
- static uint8_t* source_data_;
+ int width_, height_, mask_, bd_;
+ vpx_bit_depth_t bit_depth_;
+ static uint8_t *source_data_;
+ static uint8_t *reference_data_;
+ static uint8_t *second_pred_;
int source_stride_;
- static uint8_t* reference_data_;
+#if CONFIG_VP9_HIGHBITDEPTH
+ bool use_high_bit_depth_;
+ static uint8_t *source_data8_;
+ static uint8_t *reference_data8_;
+ static uint8_t *second_pred8_;
+ static uint16_t *source_data16_;
+ static uint16_t *reference_data16_;
+ static uint16_t *second_pred16_;
+#endif
int reference_stride_;
ACMRandom rnd_;
@@ -140,11 +309,11 @@
: public SADTestBase,
public ::testing::WithParamInterface<SadMxNx4Param> {
public:
- SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+ SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
protected:
void SADs(unsigned int *results) {
- const uint8_t* refs[] = {GetReference(0), GetReference(1),
+ const uint8_t *refs[] = {GetReference(0), GetReference(1),
GetReference(2), GetReference(3)};
ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
@@ -169,12 +338,12 @@
: public SADTestBase,
public ::testing::WithParamInterface<SadMxNParam> {
public:
- SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+ SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
protected:
unsigned int SAD(unsigned int max_sad, int block_idx) {
unsigned int ret;
- const uint8_t* const reference = GetReference(block_idx);
+ const uint8_t *const reference = GetReference(block_idx);
ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
reference, reference_stride_,
@@ -201,12 +370,12 @@
: public SADTestBase,
public ::testing::WithParamInterface<SadMxNVp9Param> {
public:
- SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+ SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
protected:
unsigned int SAD(int block_idx) {
unsigned int ret;
- const uint8_t* const reference = GetReference(block_idx);
+ const uint8_t *const reference = GetReference(block_idx);
ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
reference, reference_stride_));
@@ -220,20 +389,54 @@
ASSERT_EQ(reference_sad, exp_sad);
}
};
+
+class SADavgVP9Test
+ : public SADTestBase,
+ public ::testing::WithParamInterface<SadMxNAvgVp9Param> {
+ public:
+ SADavgVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ unsigned int SAD_avg(int block_idx) {
+ unsigned int ret;
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+ reference, reference_stride_,
+ second_pred_));
+ return ret;
+ }
+
+ void CheckSAD() {
+ const unsigned int reference_sad = ReferenceSADavg(UINT_MAX, 0);
+ const unsigned int exp_sad = SAD_avg(0);
+
+ ASSERT_EQ(reference_sad, exp_sad);
+ }
+};
#endif // CONFIG_VP9_ENCODER
-uint8_t* SADTestBase::source_data_ = NULL;
-uint8_t* SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::source_data_ = NULL;
+uint8_t *SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::second_pred_ = NULL;
+#if CONFIG_VP9_ENCODER && CONFIG_VP9_HIGHBITDEPTH
+uint8_t *SADTestBase::source_data8_ = NULL;
+uint8_t *SADTestBase::reference_data8_ = NULL;
+uint8_t *SADTestBase::second_pred8_ = NULL;
+uint16_t *SADTestBase::source_data16_ = NULL;
+uint16_t *SADTestBase::reference_data16_ = NULL;
+uint16_t *SADTestBase::second_pred16_ = NULL;
+#endif
#if CONFIG_VP8_ENCODER
TEST_P(SADTest, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
- FillConstant(reference_data_, reference_stride_, 255);
+ FillConstant(reference_data_, reference_stride_, mask_);
CheckSAD(UINT_MAX);
}
TEST_P(SADTest, MaxSrc) {
- FillConstant(source_data_, source_stride_, 255);
+ FillConstant(source_data_, source_stride_, mask_);
FillConstant(reference_data_, reference_stride_, 0);
CheckSAD(UINT_MAX);
}
@@ -270,7 +473,7 @@
TEST_P(SADTest, MaxSAD) {
// Verify that, when max_sad is set, the implementation does not return a
// value lower than the reference.
- FillConstant(source_data_, source_stride_, 255);
+ FillConstant(source_data_, source_stride_, mask_);
FillConstant(reference_data_, reference_stride_, 0);
CheckSAD(128);
}
@@ -279,12 +482,12 @@
#if CONFIG_VP9_ENCODER
TEST_P(SADVP9Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
- FillConstant(reference_data_, reference_stride_, 255);
+ FillConstant(reference_data_, reference_stride_, mask_);
CheckSAD();
}
TEST_P(SADVP9Test, MaxSrc) {
- FillConstant(source_data_, source_stride_, 255);
+ FillConstant(source_data_, source_stride_, mask_);
FillConstant(reference_data_, reference_stride_, 0);
CheckSAD();
}
@@ -317,19 +520,64 @@
CheckSAD();
source_stride_ = tmp_stride;
}
+
+TEST_P(SADavgVP9Test, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(reference_data_, reference_stride_, mask_);
+ FillConstant(second_pred_, width_, 0);
+ CheckSAD();
+}
+TEST_P(SADavgVP9Test, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(reference_data_, reference_stride_, 0);
+ FillConstant(second_pred_, width_, 0);
+ CheckSAD();
+}
+
+TEST_P(SADavgVP9Test, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgVP9Test, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgVP9Test, ShortSrc) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ source_stride_ = tmp_stride;
+}
#endif // CONFIG_VP9_ENCODER
TEST_P(SADx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
- FillConstant(GetReference(0), reference_stride_, 255);
- FillConstant(GetReference(1), reference_stride_, 255);
- FillConstant(GetReference(2), reference_stride_, 255);
- FillConstant(GetReference(3), reference_stride_, 255);
+ FillConstant(GetReference(0), reference_stride_, mask_);
+ FillConstant(GetReference(1), reference_stride_, mask_);
+ FillConstant(GetReference(2), reference_stride_, mask_);
+ FillConstant(GetReference(3), reference_stride_, mask_);
CheckSADs();
}
TEST_P(SADx4Test, MaxSrc) {
- FillConstant(source_data_, source_stride_, 255);
+ FillConstant(source_data_, source_stride_, mask_);
FillConstant(GetReference(0), reference_stride_, 0);
FillConstant(GetReference(1), reference_stride_, 0);
FillConstant(GetReference(2), reference_stride_, 0);
@@ -375,6 +623,18 @@
source_stride_ = tmp_stride;
}
+TEST_P(SADx4Test, SrcAlignedByWidth) {
+ uint8_t * tmp_source_data = source_data_;
+ source_data_ += width_;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ source_data_ = tmp_source_data;
+}
+
using std::tr1::make_tuple;
//------------------------------------------------------------------------------
@@ -386,11 +646,11 @@
const SadMxNFunc sad_8x8_c = vp8_sad8x8_c;
const SadMxNFunc sad_4x4_c = vp8_sad4x4_c;
const SadMxNParam c_tests[] = {
- make_tuple(16, 16, sad_16x16_c),
- make_tuple(8, 16, sad_8x16_c),
- make_tuple(16, 8, sad_16x8_c),
- make_tuple(8, 8, sad_8x8_c),
- make_tuple(4, 4, sad_4x4_c),
+ make_tuple(16, 16, sad_16x16_c, -1),
+ make_tuple(8, 16, sad_8x16_c, -1),
+ make_tuple(16, 8, sad_16x8_c, -1),
+ make_tuple(8, 8, sad_8x8_c, -1),
+ make_tuple(4, 4, sad_4x4_c, -1),
};
INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
#endif // CONFIG_VP8_ENCODER
@@ -406,15 +666,15 @@
const SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c;
const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c;
const SadMxNVp9Param c_vp9_tests[] = {
- make_tuple(64, 64, sad_64x64_c_vp9),
- make_tuple(32, 32, sad_32x32_c_vp9),
- make_tuple(16, 16, sad_16x16_c_vp9),
- make_tuple(8, 16, sad_8x16_c_vp9),
- make_tuple(16, 8, sad_16x8_c_vp9),
- make_tuple(8, 8, sad_8x8_c_vp9),
- make_tuple(8, 4, sad_8x4_c_vp9),
- make_tuple(4, 8, sad_4x8_c_vp9),
- make_tuple(4, 4, sad_4x4_c_vp9),
+ make_tuple(64, 64, sad_64x64_c_vp9, -1),
+ make_tuple(32, 32, sad_32x32_c_vp9, -1),
+ make_tuple(16, 16, sad_16x16_c_vp9, -1),
+ make_tuple(8, 16, sad_8x16_c_vp9, -1),
+ make_tuple(16, 8, sad_16x8_c_vp9, -1),
+ make_tuple(8, 8, sad_8x8_c_vp9, -1),
+ make_tuple(8, 4, sad_8x4_c_vp9, -1),
+ make_tuple(4, 8, sad_4x8_c_vp9, -1),
+ make_tuple(4, 4, sad_4x4_c_vp9, -1),
};
INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests));
@@ -432,19 +692,186 @@
const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c;
const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c;
INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(
- make_tuple(64, 64, sad_64x64x4d_c),
- make_tuple(64, 32, sad_64x32x4d_c),
- make_tuple(32, 64, sad_32x64x4d_c),
- make_tuple(32, 32, sad_32x32x4d_c),
- make_tuple(32, 16, sad_32x16x4d_c),
- make_tuple(16, 32, sad_16x32x4d_c),
- make_tuple(16, 16, sad_16x16x4d_c),
- make_tuple(16, 8, sad_16x8x4d_c),
- make_tuple(8, 16, sad_8x16x4d_c),
- make_tuple(8, 8, sad_8x8x4d_c),
- make_tuple(8, 4, sad_8x4x4d_c),
- make_tuple(4, 8, sad_4x8x4d_c),
- make_tuple(4, 4, sad_4x4x4d_c)));
+ make_tuple(64, 64, sad_64x64x4d_c, -1),
+ make_tuple(64, 32, sad_64x32x4d_c, -1),
+ make_tuple(32, 64, sad_32x64x4d_c, -1),
+ make_tuple(32, 32, sad_32x32x4d_c, -1),
+ make_tuple(32, 16, sad_32x16x4d_c, -1),
+ make_tuple(16, 32, sad_16x32x4d_c, -1),
+ make_tuple(16, 16, sad_16x16x4d_c, -1),
+ make_tuple(16, 8, sad_16x8x4d_c, -1),
+ make_tuple(8, 16, sad_8x16x4d_c, -1),
+ make_tuple(8, 8, sad_8x8x4d_c, -1),
+ make_tuple(8, 4, sad_8x4x4d_c, -1),
+ make_tuple(4, 8, sad_4x8x4d_c, -1),
+ make_tuple(4, 4, sad_4x4x4d_c, -1)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNVp9Func highbd_sad_64x64_c_vp9 = vp9_highbd_sad64x64_c;
+const SadMxNVp9Func highbd_sad_32x32_c_vp9 = vp9_highbd_sad32x32_c;
+const SadMxNVp9Func highbd_sad_16x16_c_vp9 = vp9_highbd_sad16x16_c;
+const SadMxNVp9Func highbd_sad_8x16_c_vp9 = vp9_highbd_sad8x16_c;
+const SadMxNVp9Func highbd_sad_16x8_c_vp9 = vp9_highbd_sad16x8_c;
+const SadMxNVp9Func highbd_sad_8x8_c_vp9 = vp9_highbd_sad8x8_c;
+const SadMxNVp9Func highbd_sad_8x4_c_vp9 = vp9_highbd_sad8x4_c;
+const SadMxNVp9Func highbd_sad_4x8_c_vp9 = vp9_highbd_sad4x8_c;
+const SadMxNVp9Func highbd_sad_4x4_c_vp9 = vp9_highbd_sad4x4_c;
+const SadMxNVp9Param c_vp9_highbd_8_tests[] = {
+ make_tuple(64, 64, highbd_sad_64x64_c_vp9, 8),
+ make_tuple(32, 32, highbd_sad_32x32_c_vp9, 8),
+ make_tuple(16, 16, highbd_sad_16x16_c_vp9, 8),
+ make_tuple(8, 16, highbd_sad_8x16_c_vp9, 8),
+ make_tuple(16, 8, highbd_sad_16x8_c_vp9, 8),
+ make_tuple(8, 8, highbd_sad_8x8_c_vp9, 8),
+ make_tuple(8, 4, highbd_sad_8x4_c_vp9, 8),
+ make_tuple(4, 8, highbd_sad_4x8_c_vp9, 8),
+ make_tuple(4, 4, highbd_sad_4x4_c_vp9, 8),
+};
+INSTANTIATE_TEST_CASE_P(C_8, SADVP9Test,
+ ::testing::ValuesIn(c_vp9_highbd_8_tests));
+
+const SadMxNVp9Param c_vp9_highbd_10_tests[] = {
+ make_tuple(64, 64, highbd_sad_64x64_c_vp9, 10),
+ make_tuple(32, 32, highbd_sad_32x32_c_vp9, 10),
+ make_tuple(16, 16, highbd_sad_16x16_c_vp9, 10),
+ make_tuple(8, 16, highbd_sad_8x16_c_vp9, 10),
+ make_tuple(16, 8, highbd_sad_16x8_c_vp9, 10),
+ make_tuple(8, 8, highbd_sad_8x8_c_vp9, 10),
+ make_tuple(8, 4, highbd_sad_8x4_c_vp9, 10),
+ make_tuple(4, 8, highbd_sad_4x8_c_vp9, 10),
+ make_tuple(4, 4, highbd_sad_4x4_c_vp9, 10),
+};
+INSTANTIATE_TEST_CASE_P(C_10, SADVP9Test,
+ ::testing::ValuesIn(c_vp9_highbd_10_tests));
+
+const SadMxNVp9Param c_vp9_highbd_12_tests[] = {
+ make_tuple(64, 64, highbd_sad_64x64_c_vp9, 12),
+ make_tuple(32, 32, highbd_sad_32x32_c_vp9, 12),
+ make_tuple(16, 16, highbd_sad_16x16_c_vp9, 12),
+ make_tuple(8, 16, highbd_sad_8x16_c_vp9, 12),
+ make_tuple(16, 8, highbd_sad_16x8_c_vp9, 12),
+ make_tuple(8, 8, highbd_sad_8x8_c_vp9, 12),
+ make_tuple(8, 4, highbd_sad_8x4_c_vp9, 12),
+ make_tuple(4, 8, highbd_sad_4x8_c_vp9, 12),
+ make_tuple(4, 4, highbd_sad_4x4_c_vp9, 12),
+};
+INSTANTIATE_TEST_CASE_P(C_12, SADVP9Test,
+ ::testing::ValuesIn(c_vp9_highbd_12_tests));
+
+const SadMxNAvgVp9Func highbd_sad8x4_avg_c_vp9 = vp9_highbd_sad8x4_avg_c;
+const SadMxNAvgVp9Func highbd_sad8x8_avg_c_vp9 = vp9_highbd_sad8x8_avg_c;
+const SadMxNAvgVp9Func highbd_sad8x16_avg_c_vp9 = vp9_highbd_sad8x16_avg_c;
+const SadMxNAvgVp9Func highbd_sad16x8_avg_c_vp9 = vp9_highbd_sad16x8_avg_c;
+const SadMxNAvgVp9Func highbd_sad16x16_avg_c_vp9 = vp9_highbd_sad16x16_avg_c;
+const SadMxNAvgVp9Func highbd_sad16x32_avg_c_vp9 = vp9_highbd_sad16x32_avg_c;
+const SadMxNAvgVp9Func highbd_sad32x16_avg_c_vp9 = vp9_highbd_sad32x16_avg_c;
+const SadMxNAvgVp9Func highbd_sad32x32_avg_c_vp9 = vp9_highbd_sad32x32_avg_c;
+const SadMxNAvgVp9Func highbd_sad32x64_avg_c_vp9 = vp9_highbd_sad32x64_avg_c;
+const SadMxNAvgVp9Func highbd_sad64x32_avg_c_vp9 = vp9_highbd_sad64x32_avg_c;
+const SadMxNAvgVp9Func highbd_sad64x64_avg_c_vp9 = vp9_highbd_sad64x64_avg_c;
+SadMxNAvgVp9Param avg_c_vp9_highbd_8_tests[] = {
+ make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 8),
+ make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 8),
+ make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 8),
+ make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 8),
+ make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 8),
+ make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 8),
+ make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 8),
+ make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 8),
+ make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 8),
+ make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 8),
+ make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 8)};
+INSTANTIATE_TEST_CASE_P(C_8, SADavgVP9Test,
+ ::testing::ValuesIn(avg_c_vp9_highbd_8_tests));
+
+SadMxNAvgVp9Param avg_c_vp9_highbd_10_tests[] = {
+ make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 10),
+ make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 10),
+ make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 10),
+ make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 10),
+ make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 10),
+ make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 10),
+ make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 10),
+ make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 10),
+ make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 10),
+ make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 10),
+ make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 10)};
+INSTANTIATE_TEST_CASE_P(C_10, SADavgVP9Test,
+ ::testing::ValuesIn(avg_c_vp9_highbd_10_tests));
+
+SadMxNAvgVp9Param avg_c_vp9_highbd_12_tests[] = {
+ make_tuple(8, 4, highbd_sad8x4_avg_c_vp9, 12),
+ make_tuple(8, 8, highbd_sad8x8_avg_c_vp9, 12),
+ make_tuple(8, 16, highbd_sad8x16_avg_c_vp9, 12),
+ make_tuple(16, 8, highbd_sad16x8_avg_c_vp9, 12),
+ make_tuple(16, 16, highbd_sad16x16_avg_c_vp9, 12),
+ make_tuple(16, 32, highbd_sad16x32_avg_c_vp9, 12),
+ make_tuple(32, 16, highbd_sad32x16_avg_c_vp9, 12),
+ make_tuple(32, 32, highbd_sad32x32_avg_c_vp9, 12),
+ make_tuple(32, 64, highbd_sad32x64_avg_c_vp9, 12),
+ make_tuple(64, 32, highbd_sad64x32_avg_c_vp9, 12),
+ make_tuple(64, 64, highbd_sad64x64_avg_c_vp9, 12)};
+INSTANTIATE_TEST_CASE_P(C_12, SADavgVP9Test,
+ ::testing::ValuesIn(avg_c_vp9_highbd_12_tests));
+
+const SadMxNx4Func highbd_sad_64x64x4d_c = vp9_highbd_sad64x64x4d_c;
+const SadMxNx4Func highbd_sad_64x32x4d_c = vp9_highbd_sad64x32x4d_c;
+const SadMxNx4Func highbd_sad_32x64x4d_c = vp9_highbd_sad32x64x4d_c;
+const SadMxNx4Func highbd_sad_32x32x4d_c = vp9_highbd_sad32x32x4d_c;
+const SadMxNx4Func highbd_sad_32x16x4d_c = vp9_highbd_sad32x16x4d_c;
+const SadMxNx4Func highbd_sad_16x32x4d_c = vp9_highbd_sad16x32x4d_c;
+const SadMxNx4Func highbd_sad_16x16x4d_c = vp9_highbd_sad16x16x4d_c;
+const SadMxNx4Func highbd_sad_16x8x4d_c = vp9_highbd_sad16x8x4d_c;
+const SadMxNx4Func highbd_sad_8x16x4d_c = vp9_highbd_sad8x16x4d_c;
+const SadMxNx4Func highbd_sad_8x8x4d_c = vp9_highbd_sad8x8x4d_c;
+const SadMxNx4Func highbd_sad_8x4x4d_c = vp9_highbd_sad8x4x4d_c;
+const SadMxNx4Func highbd_sad_4x8x4d_c = vp9_highbd_sad4x8x4d_c;
+const SadMxNx4Func highbd_sad_4x4x4d_c = vp9_highbd_sad4x4x4d_c;
+INSTANTIATE_TEST_CASE_P(C_8, SADx4Test, ::testing::Values(
+ make_tuple(64, 64, highbd_sad_64x64x4d_c, 8),
+ make_tuple(64, 32, highbd_sad_64x32x4d_c, 8),
+ make_tuple(32, 64, highbd_sad_32x64x4d_c, 8),
+ make_tuple(32, 32, highbd_sad_32x32x4d_c, 8),
+ make_tuple(32, 16, highbd_sad_32x16x4d_c, 8),
+ make_tuple(16, 32, highbd_sad_16x32x4d_c, 8),
+ make_tuple(16, 16, highbd_sad_16x16x4d_c, 8),
+ make_tuple(16, 8, highbd_sad_16x8x4d_c, 8),
+ make_tuple(8, 16, highbd_sad_8x16x4d_c, 8),
+ make_tuple(8, 8, highbd_sad_8x8x4d_c, 8),
+ make_tuple(8, 4, highbd_sad_8x4x4d_c, 8),
+ make_tuple(4, 8, highbd_sad_4x8x4d_c, 8),
+ make_tuple(4, 4, highbd_sad_4x4x4d_c, 8)));
+
+INSTANTIATE_TEST_CASE_P(C_10, SADx4Test, ::testing::Values(
+ make_tuple(64, 64, highbd_sad_64x64x4d_c, 10),
+ make_tuple(64, 32, highbd_sad_64x32x4d_c, 10),
+ make_tuple(32, 64, highbd_sad_32x64x4d_c, 10),
+ make_tuple(32, 32, highbd_sad_32x32x4d_c, 10),
+ make_tuple(32, 16, highbd_sad_32x16x4d_c, 10),
+ make_tuple(16, 32, highbd_sad_16x32x4d_c, 10),
+ make_tuple(16, 16, highbd_sad_16x16x4d_c, 10),
+ make_tuple(16, 8, highbd_sad_16x8x4d_c, 10),
+ make_tuple(8, 16, highbd_sad_8x16x4d_c, 10),
+ make_tuple(8, 8, highbd_sad_8x8x4d_c, 10),
+ make_tuple(8, 4, highbd_sad_8x4x4d_c, 10),
+ make_tuple(4, 8, highbd_sad_4x8x4d_c, 10),
+ make_tuple(4, 4, highbd_sad_4x4x4d_c, 10)));
+
+INSTANTIATE_TEST_CASE_P(C_12, SADx4Test, ::testing::Values(
+ make_tuple(64, 64, highbd_sad_64x64x4d_c, 12),
+ make_tuple(64, 32, highbd_sad_64x32x4d_c, 12),
+ make_tuple(32, 64, highbd_sad_32x64x4d_c, 12),
+ make_tuple(32, 32, highbd_sad_32x32x4d_c, 12),
+ make_tuple(32, 16, highbd_sad_32x16x4d_c, 12),
+ make_tuple(16, 32, highbd_sad_16x32x4d_c, 12),
+ make_tuple(16, 16, highbd_sad_16x16x4d_c, 12),
+ make_tuple(16, 8, highbd_sad_16x8x4d_c, 12),
+ make_tuple(8, 16, highbd_sad_8x16x4d_c, 12),
+ make_tuple(8, 8, highbd_sad_8x8x4d_c, 12),
+ make_tuple(8, 4, highbd_sad_8x4x4d_c, 12),
+ make_tuple(4, 8, highbd_sad_4x8x4d_c, 12),
+ make_tuple(4, 4, highbd_sad_4x4x4d_c, 12)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP9_ENCODER
//------------------------------------------------------------------------------
@@ -453,7 +880,7 @@
#if CONFIG_VP8_ENCODER
const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6;
INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
- make_tuple(16, 16, sad_16x16_armv6)));
+ make_tuple(16, 16, sad_16x16_armv6, -1)));
#endif // CONFIG_VP8_ENCODER
#endif // HAVE_MEDIA
@@ -465,11 +892,11 @@
const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon;
const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon;
INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
- make_tuple(16, 16, sad_16x16_neon),
- make_tuple(8, 16, sad_8x16_neon),
- make_tuple(16, 8, sad_16x8_neon),
- make_tuple(8, 8, sad_8x8_neon),
- make_tuple(4, 4, sad_4x4_neon)));
+ make_tuple(16, 16, sad_16x16_neon, -1),
+ make_tuple(8, 16, sad_8x16_neon, -1),
+ make_tuple(16, 8, sad_16x8_neon, -1),
+ make_tuple(8, 8, sad_8x8_neon, -1),
+ make_tuple(4, 4, sad_4x4_neon, -1)));
#endif // CONFIG_VP8_ENCODER
#if CONFIG_VP9_ENCODER
const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon;
@@ -477,10 +904,10 @@
const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon;
const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon;
const SadMxNVp9Param neon_vp9_tests[] = {
- make_tuple(64, 64, sad_64x64_neon_vp9),
- make_tuple(32, 32, sad_32x32_neon_vp9),
- make_tuple(16, 16, sad_16x16_neon_vp9),
- make_tuple(8, 8, sad_8x8_neon_vp9),
+ make_tuple(64, 64, sad_64x64_neon_vp9, -1),
+ make_tuple(32, 32, sad_32x32_neon_vp9, -1),
+ make_tuple(16, 16, sad_16x16_neon_vp9, -1),
+ make_tuple(8, 8, sad_8x8_neon_vp9, -1),
};
INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests));
#endif // CONFIG_VP9_ENCODER
@@ -496,11 +923,11 @@
const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx;
const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx;
const SadMxNParam mmx_tests[] = {
- make_tuple(16, 16, sad_16x16_mmx),
- make_tuple(8, 16, sad_8x16_mmx),
- make_tuple(16, 8, sad_16x8_mmx),
- make_tuple(8, 8, sad_8x8_mmx),
- make_tuple(4, 4, sad_4x4_mmx),
+ make_tuple(16, 16, sad_16x16_mmx, -1),
+ make_tuple(8, 16, sad_8x16_mmx, -1),
+ make_tuple(16, 8, sad_16x8_mmx, -1),
+ make_tuple(8, 8, sad_8x8_mmx, -1),
+ make_tuple(4, 4, sad_4x4_mmx, -1),
};
INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
#endif // CONFIG_VP8_ENCODER
@@ -513,14 +940,14 @@
const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse;
const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse;
INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values(
- make_tuple(4, 4, sad_4x4_sse_vp9),
- make_tuple(4, 8, sad_4x8_sse_vp9)));
+ make_tuple(4, 4, sad_4x4_sse_vp9, -1),
+ make_tuple(4, 8, sad_4x8_sse_vp9, -1)));
const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse;
const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse;
INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(
- make_tuple(4, 8, sad_4x8x4d_sse),
- make_tuple(4, 4, sad_4x4x4d_sse)));
+ make_tuple(4, 8, sad_4x8x4d_sse, -1),
+ make_tuple(4, 4, sad_4x4x4d_sse, -1)));
#endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_ENCODER
#endif // HAVE_SSE
@@ -533,11 +960,11 @@
const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt;
const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt;
const SadMxNParam sse2_tests[] = {
- make_tuple(16, 16, sad_16x16_wmt),
- make_tuple(8, 16, sad_8x16_wmt),
- make_tuple(16, 8, sad_16x8_wmt),
- make_tuple(8, 8, sad_8x8_wmt),
- make_tuple(4, 4, sad_4x4_wmt),
+ make_tuple(16, 16, sad_16x16_wmt, -1),
+ make_tuple(8, 16, sad_8x16_wmt, -1),
+ make_tuple(16, 8, sad_16x8_wmt, -1),
+ make_tuple(8, 8, sad_8x8_wmt, -1),
+ make_tuple(4, 4, sad_4x4_wmt, -1),
};
INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
#endif // CONFIG_VP8_ENCODER
@@ -555,20 +982,6 @@
const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;
const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;
const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2;
-const SadMxNVp9Param sse2_vp9_tests[] = {
- make_tuple(64, 64, sad_64x64_sse2_vp9),
- make_tuple(64, 32, sad_64x32_sse2_vp9),
- make_tuple(32, 64, sad_32x64_sse2_vp9),
- make_tuple(32, 32, sad_32x32_sse2_vp9),
- make_tuple(32, 16, sad_32x16_sse2_vp9),
- make_tuple(16, 32, sad_16x32_sse2_vp9),
- make_tuple(16, 16, sad_16x16_sse2_vp9),
- make_tuple(16, 8, sad_16x8_sse2_vp9),
- make_tuple(8, 16, sad_8x16_sse2_vp9),
- make_tuple(8, 8, sad_8x8_sse2_vp9),
- make_tuple(8, 4, sad_8x4_sse2_vp9),
-};
-INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests));
const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;
const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2;
@@ -581,18 +994,214 @@
const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;
const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;
const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const SadMxNVp9Func highbd_sad8x4_sse2_vp9 = vp9_highbd_sad8x4_sse2;
+const SadMxNVp9Func highbd_sad8x8_sse2_vp9 = vp9_highbd_sad8x8_sse2;
+const SadMxNVp9Func highbd_sad8x16_sse2_vp9 = vp9_highbd_sad8x16_sse2;
+const SadMxNVp9Func highbd_sad16x8_sse2_vp9 = vp9_highbd_sad16x8_sse2;
+const SadMxNVp9Func highbd_sad16x16_sse2_vp9 = vp9_highbd_sad16x16_sse2;
+const SadMxNVp9Func highbd_sad16x32_sse2_vp9 = vp9_highbd_sad16x32_sse2;
+const SadMxNVp9Func highbd_sad32x16_sse2_vp9 = vp9_highbd_sad32x16_sse2;
+const SadMxNVp9Func highbd_sad32x32_sse2_vp9 = vp9_highbd_sad32x32_sse2;
+const SadMxNVp9Func highbd_sad32x64_sse2_vp9 = vp9_highbd_sad32x64_sse2;
+const SadMxNVp9Func highbd_sad64x32_sse2_vp9 = vp9_highbd_sad64x32_sse2;
+const SadMxNVp9Func highbd_sad64x64_sse2_vp9 = vp9_highbd_sad64x64_sse2;
+
+INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values(
+ make_tuple(64, 64, sad_64x64_sse2_vp9, -1),
+ make_tuple(64, 32, sad_64x32_sse2_vp9, -1),
+ make_tuple(32, 64, sad_32x64_sse2_vp9, -1),
+ make_tuple(32, 32, sad_32x32_sse2_vp9, -1),
+ make_tuple(32, 16, sad_32x16_sse2_vp9, -1),
+ make_tuple(16, 32, sad_16x32_sse2_vp9, -1),
+ make_tuple(16, 16, sad_16x16_sse2_vp9, -1),
+ make_tuple(16, 8, sad_16x8_sse2_vp9, -1),
+ make_tuple(8, 16, sad_8x16_sse2_vp9, -1),
+ make_tuple(8, 8, sad_8x8_sse2_vp9, -1),
+ make_tuple(8, 4, sad_8x4_sse2_vp9, -1),
+ make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 8),
+ make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 8),
+ make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 8),
+ make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 8),
+ make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 8),
+ make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 8),
+ make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 8),
+ make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 8),
+ make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 8),
+ make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 8),
+ make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 8),
+ make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 10),
+ make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 10),
+ make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 10),
+ make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 10),
+ make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 10),
+ make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 10),
+ make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 10),
+ make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 10),
+ make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 10),
+ make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 10),
+ make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 10),
+ make_tuple(8, 4, highbd_sad8x4_sse2_vp9, 12),
+ make_tuple(8, 8, highbd_sad8x8_sse2_vp9, 12),
+ make_tuple(8, 16, highbd_sad8x16_sse2_vp9, 12),
+ make_tuple(16, 8, highbd_sad16x8_sse2_vp9, 12),
+ make_tuple(16, 16, highbd_sad16x16_sse2_vp9, 12),
+ make_tuple(16, 32, highbd_sad16x32_sse2_vp9, 12),
+ make_tuple(32, 16, highbd_sad32x16_sse2_vp9, 12),
+ make_tuple(32, 32, highbd_sad32x32_sse2_vp9, 12),
+ make_tuple(32, 64, highbd_sad32x64_sse2_vp9, 12),
+ make_tuple(64, 32, highbd_sad64x32_sse2_vp9, 12),
+ make_tuple(64, 64, highbd_sad64x64_sse2_vp9, 12)));
+
+const SadMxNAvgVp9Func highbd_sad8x4_avg_sse2_vp9 = vp9_highbd_sad8x4_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad8x8_avg_sse2_vp9 = vp9_highbd_sad8x8_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad8x16_avg_sse2_vp9 =
+ vp9_highbd_sad8x16_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad16x8_avg_sse2_vp9 =
+ vp9_highbd_sad16x8_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad16x16_avg_sse2_vp9 =
+ vp9_highbd_sad16x16_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad16x32_avg_sse2_vp9 =
+ vp9_highbd_sad16x32_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad32x16_avg_sse2_vp9 =
+ vp9_highbd_sad32x16_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad32x32_avg_sse2_vp9 =
+ vp9_highbd_sad32x32_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad32x64_avg_sse2_vp9 =
+ vp9_highbd_sad32x64_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad64x32_avg_sse2_vp9 =
+ vp9_highbd_sad64x32_avg_sse2;
+const SadMxNAvgVp9Func highbd_sad64x64_avg_sse2_vp9 =
+ vp9_highbd_sad64x64_avg_sse2;
+
+INSTANTIATE_TEST_CASE_P(SSE2, SADavgVP9Test, ::testing::Values(
+ make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 8),
+ make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 8),
+ make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 8),
+ make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 8),
+ make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 8),
+ make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 8),
+ make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 8),
+ make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 8),
+ make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 8),
+ make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 8),
+ make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 8),
+ make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 10),
+ make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 10),
+ make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 10),
+ make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 10),
+ make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 10),
+ make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 10),
+ make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 10),
+ make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 10),
+ make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 10),
+ make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 10),
+ make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 10),
+ make_tuple(8, 4, highbd_sad8x4_avg_sse2_vp9, 12),
+ make_tuple(8, 8, highbd_sad8x8_avg_sse2_vp9, 12),
+ make_tuple(8, 16, highbd_sad8x16_avg_sse2_vp9, 12),
+ make_tuple(16, 8, highbd_sad16x8_avg_sse2_vp9, 12),
+ make_tuple(16, 16, highbd_sad16x16_avg_sse2_vp9, 12),
+ make_tuple(16, 32, highbd_sad16x32_avg_sse2_vp9, 12),
+ make_tuple(32, 16, highbd_sad32x16_avg_sse2_vp9, 12),
+ make_tuple(32, 32, highbd_sad32x32_avg_sse2_vp9, 12),
+ make_tuple(32, 64, highbd_sad32x64_avg_sse2_vp9, 12),
+ make_tuple(64, 32, highbd_sad64x32_avg_sse2_vp9, 12),
+ make_tuple(64, 64, highbd_sad64x64_avg_sse2_vp9, 12)));
+
+const SadMxNx4Func highbd_sad_64x64x4d_sse2 = vp9_highbd_sad64x64x4d_sse2;
+const SadMxNx4Func highbd_sad_64x32x4d_sse2 = vp9_highbd_sad64x32x4d_sse2;
+const SadMxNx4Func highbd_sad_32x64x4d_sse2 = vp9_highbd_sad32x64x4d_sse2;
+const SadMxNx4Func highbd_sad_32x32x4d_sse2 = vp9_highbd_sad32x32x4d_sse2;
+const SadMxNx4Func highbd_sad_32x16x4d_sse2 = vp9_highbd_sad32x16x4d_sse2;
+const SadMxNx4Func highbd_sad_16x32x4d_sse2 = vp9_highbd_sad16x32x4d_sse2;
+const SadMxNx4Func highbd_sad_16x16x4d_sse2 = vp9_highbd_sad16x16x4d_sse2;
+const SadMxNx4Func highbd_sad_16x8x4d_sse2 = vp9_highbd_sad16x8x4d_sse2;
+const SadMxNx4Func highbd_sad_8x16x4d_sse2 = vp9_highbd_sad8x16x4d_sse2;
+const SadMxNx4Func highbd_sad_8x8x4d_sse2 = vp9_highbd_sad8x8x4d_sse2;
+const SadMxNx4Func highbd_sad_8x4x4d_sse2 = vp9_highbd_sad8x4x4d_sse2;
+const SadMxNx4Func highbd_sad_4x8x4d_sse2 = vp9_highbd_sad4x8x4d_sse2;
+const SadMxNx4Func highbd_sad_4x4x4d_sse2 = vp9_highbd_sad4x4x4d_sse2;
+
INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
- make_tuple(64, 64, sad_64x64x4d_sse2),
- make_tuple(64, 32, sad_64x32x4d_sse2),
- make_tuple(32, 64, sad_32x64x4d_sse2),
- make_tuple(32, 32, sad_32x32x4d_sse2),
- make_tuple(32, 16, sad_32x16x4d_sse2),
- make_tuple(16, 32, sad_16x32x4d_sse2),
- make_tuple(16, 16, sad_16x16x4d_sse2),
- make_tuple(16, 8, sad_16x8x4d_sse2),
- make_tuple(8, 16, sad_8x16x4d_sse2),
- make_tuple(8, 8, sad_8x8x4d_sse2),
- make_tuple(8, 4, sad_8x4x4d_sse2)));
+ make_tuple(64, 64, sad_64x64x4d_sse2, -1),
+ make_tuple(64, 32, sad_64x32x4d_sse2, -1),
+ make_tuple(32, 64, sad_32x64x4d_sse2, -1),
+ make_tuple(32, 32, sad_32x32x4d_sse2, -1),
+ make_tuple(32, 16, sad_32x16x4d_sse2, -1),
+ make_tuple(16, 32, sad_16x32x4d_sse2, -1),
+ make_tuple(16, 16, sad_16x16x4d_sse2, -1),
+ make_tuple(16, 8, sad_16x8x4d_sse2, -1),
+ make_tuple(8, 16, sad_8x16x4d_sse2, -1),
+ make_tuple(8, 8, sad_8x8x4d_sse2, -1),
+ make_tuple(8, 4, sad_8x4x4d_sse2, -1),
+ make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 8),
+ make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 8),
+ make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 8),
+ make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 8),
+ make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 8),
+ make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 8),
+ make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 8),
+ make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 8),
+ make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 8),
+ make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 8),
+ make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 8),
+ make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 8),
+ make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 8),
+ make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 10),
+ make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 10),
+ make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 10),
+ make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 10),
+ make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 10),
+ make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 10),
+ make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 10),
+ make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 10),
+ make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 10),
+ make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 10),
+ make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 10),
+ make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 10),
+ make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 10),
+ make_tuple(64, 64, highbd_sad_64x64x4d_sse2, 12),
+ make_tuple(64, 32, highbd_sad_64x32x4d_sse2, 12),
+ make_tuple(32, 64, highbd_sad_32x64x4d_sse2, 12),
+ make_tuple(32, 32, highbd_sad_32x32x4d_sse2, 12),
+ make_tuple(32, 16, highbd_sad_32x16x4d_sse2, 12),
+ make_tuple(16, 32, highbd_sad_16x32x4d_sse2, 12),
+ make_tuple(16, 16, highbd_sad_16x16x4d_sse2, 12),
+ make_tuple(16, 8, highbd_sad_16x8x4d_sse2, 12),
+ make_tuple(8, 16, highbd_sad_8x16x4d_sse2, 12),
+ make_tuple(8, 8, highbd_sad_8x8x4d_sse2, 12),
+ make_tuple(8, 4, highbd_sad_8x4x4d_sse2, 12),
+ make_tuple(4, 8, highbd_sad_4x8x4d_sse2, 12),
+ make_tuple(4, 4, highbd_sad_4x4x4d_sse2, 12)));
+#else
+INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::Values(
+ make_tuple(64, 64, sad_64x64_sse2_vp9, -1),
+ make_tuple(64, 32, sad_64x32_sse2_vp9, -1),
+ make_tuple(32, 64, sad_32x64_sse2_vp9, -1),
+ make_tuple(32, 32, sad_32x32_sse2_vp9, -1),
+ make_tuple(32, 16, sad_32x16_sse2_vp9, -1),
+ make_tuple(16, 32, sad_16x32_sse2_vp9, -1),
+ make_tuple(16, 16, sad_16x16_sse2_vp9, -1),
+ make_tuple(16, 8, sad_16x8_sse2_vp9, -1),
+ make_tuple(8, 16, sad_8x16_sse2_vp9, -1),
+ make_tuple(8, 8, sad_8x8_sse2_vp9, -1),
+ make_tuple(8, 4, sad_8x4_sse2_vp9, -1)));
+
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(
+ make_tuple(64, 64, sad_64x64x4d_sse2, -1),
+ make_tuple(64, 32, sad_64x32x4d_sse2, -1),
+ make_tuple(32, 64, sad_32x64x4d_sse2, -1),
+ make_tuple(32, 32, sad_32x32x4d_sse2, -1),
+ make_tuple(32, 16, sad_32x16x4d_sse2, -1),
+ make_tuple(16, 32, sad_16x32x4d_sse2, -1),
+ make_tuple(16, 16, sad_16x16x4d_sse2, -1),
+ make_tuple(16, 8, sad_16x8x4d_sse2, -1),
+ make_tuple(8, 16, sad_8x16x4d_sse2, -1),
+ make_tuple(8, 8, sad_8x8x4d_sse2, -1),
+ make_tuple(8, 4, sad_8x4x4d_sse2, -1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_ENCODER
#endif // HAVE_SSE2
@@ -605,11 +1214,11 @@
const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3;
const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3;
INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(
- make_tuple(16, 16, sad_16x16x4d_sse3),
- make_tuple(16, 8, sad_16x8x4d_sse3),
- make_tuple(8, 16, sad_8x16x4d_sse3),
- make_tuple(8, 8, sad_8x8x4d_sse3),
- make_tuple(4, 4, sad_4x4x4d_sse3)));
+ make_tuple(16, 16, sad_16x16x4d_sse3, -1),
+ make_tuple(16, 8, sad_16x8x4d_sse3, -1),
+ make_tuple(8, 16, sad_8x16x4d_sse3, -1),
+ make_tuple(8, 8, sad_8x8x4d_sse3, -1),
+ make_tuple(4, 4, sad_4x4x4d_sse3, -1)));
#endif // CONFIG_VP8_ENCODER
#endif // HAVE_SSE3
@@ -618,7 +1227,7 @@
#if CONFIG_VP8_ENCODER
const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3;
INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
- make_tuple(16, 16, sad_16x16_sse3)));
+ make_tuple(16, 16, sad_16x16_sse3, -1)));
#endif // CONFIG_VP8_ENCODER
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
@@ -625,25 +1234,11 @@
#if HAVE_AVX2
#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_64x64_avx2_vp9 = vp9_sad64x64_avx2;
-const SadMxNVp9Func sad_64x32_avx2_vp9 = vp9_sad64x32_avx2;
-const SadMxNVp9Func sad_32x64_avx2_vp9 = vp9_sad32x64_avx2;
-const SadMxNVp9Func sad_32x32_avx2_vp9 = vp9_sad32x32_avx2;
-const SadMxNVp9Func sad_32x16_avx2_vp9 = vp9_sad32x16_avx2;
-const SadMxNVp9Param avx2_vp9_tests[] = {
- make_tuple(64, 64, sad_64x64_avx2_vp9),
- make_tuple(64, 32, sad_64x32_avx2_vp9),
- make_tuple(32, 64, sad_32x64_avx2_vp9),
- make_tuple(32, 32, sad_32x32_avx2_vp9),
- make_tuple(32, 16, sad_32x16_avx2_vp9),
-};
-INSTANTIATE_TEST_CASE_P(AVX2, SADVP9Test, ::testing::ValuesIn(avx2_vp9_tests));
-
const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
- make_tuple(32, 32, sad_32x32x4d_avx2),
- make_tuple(64, 64, sad_64x64x4d_avx2)));
+ make_tuple(32, 32, sad_32x32x4d_avx2, -1),
+ make_tuple(64, 64, sad_64x64x4d_avx2, -1)));
#endif // CONFIG_VP9_ENCODER
#endif // HAVE_AVX2
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1652,37 +1652,37 @@
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad64x64/;
+ specialize qw/vp9_highbd_sad64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x64/;
+ specialize qw/vp9_highbd_sad32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad64x32/;
+ specialize qw/vp9_highbd_sad64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x16/;
+ specialize qw/vp9_highbd_sad32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x32/;
+ specialize qw/vp9_highbd_sad16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x32/;
+ specialize qw/vp9_highbd_sad32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x16/;
+ specialize qw/vp9_highbd_sad16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x8/;
+ specialize qw/vp9_highbd_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x16/;
+ specialize qw/vp9_highbd_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x8/;
+ specialize qw/vp9_highbd_sad8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x4/;
+ specialize qw/vp9_highbd_sad8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_highbd_sad4x8/;
@@ -1691,37 +1691,37 @@
specialize qw/vp9_highbd_sad4x4/;
add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad64x64_avg/;
+ specialize qw/vp9_highbd_sad64x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x64_avg/;
+ specialize qw/vp9_highbd_sad32x64_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad64x32_avg/;
+ specialize qw/vp9_highbd_sad64x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x16_avg/;
+ specialize qw/vp9_highbd_sad32x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x32_avg/;
+ specialize qw/vp9_highbd_sad16x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x32_avg/;
+ specialize qw/vp9_highbd_sad32x32_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x16_avg/;
+ specialize qw/vp9_highbd_sad16x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x8_avg/;
+ specialize qw/vp9_highbd_sad16x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x16_avg/;
+ specialize qw/vp9_highbd_sad8x16_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x8_avg/;
+ specialize qw/vp9_highbd_sad8x8_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x4_avg/;
+ specialize qw/vp9_highbd_sad8x4_avg/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_highbd_sad4x8_avg/;
@@ -1778,44 +1778,43 @@
specialize qw/vp9_highbd_sad4x4x8/;
add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x64x4d/;
+ specialize qw/vp9_highbd_sad64x64x4d sse2/;
add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x64x4d/;
+ specialize qw/vp9_highbd_sad32x64x4d sse2/;
add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x32x4d/;
+ specialize qw/vp9_highbd_sad64x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x16x4d/;
+ specialize qw/vp9_highbd_sad32x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x32x4d/;
+ specialize qw/vp9_highbd_sad16x32x4d sse2/;
add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x32x4d/;
+ specialize qw/vp9_highbd_sad32x32x4d sse2/;
add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x16x4d/;
+ specialize qw/vp9_highbd_sad16x16x4d sse2/;
add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x8x4d/;
+ specialize qw/vp9_highbd_sad16x8x4d sse2/;
add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x16x4d/;
+ specialize qw/vp9_highbd_sad8x16x4d sse2/;
add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x8x4d/;
+ specialize qw/vp9_highbd_sad8x8x4d sse2/;
- # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x4x4d/;
+ specialize qw/vp9_highbd_sad8x4x4d sse2/;
add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x8x4d/;
+ specialize qw/vp9_highbd_sad4x8x4d sse2/;
add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x4x4d/;
+ specialize qw/vp9_highbd_sad4x4x4d sse2/;
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse16x16/;
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm
@@ -1,0 +1,284 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0 ; 2 rows x 4 px of src vs ref1..ref4; first=1 starts the sums in m4-m7, else adds to them
+ movh m0, [srcq +%2*2] ; src row 1 -> low half of m0 (offsets count 16-bit pixels, hence *2)
+%if %1 == 1
+ movu m4, [ref1q+%3*2] ; ref row 1; the upper half is overwritten with row 2 below
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2] ; src row 2 -> high half of m0
+ movhps m4, [ref1q+%5*2] ; ref row 2 -> high half
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4 ; saturating src - ref1
+ psubusw m2, m5 ; saturating src - ref2
+ psubusw m4, m0 ; saturating ref1 - src
+ psubusw m5, m0 ; saturating ref2 - src
+ por m4, m3 ; one of the two differences saturated to 0, so the OR is |src - ref1|
+ por m5, m2 ; |src - ref2|
+ pmaddwd m4, m1 ; m1 = words of 1 (set in HIGH_SADNXN4D): adjacent words are summed into dwords
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6 ; same sequence for ref3 / ref4
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2] ; src row 2 -> high half of m0
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3 ; |src - ref1|
+ pmaddwd m2, m1
+ paddd m4, m2 ; add to the ref1 running sum
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2 ; add to the ref2 running sum
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2 ; add to the ref3 running sum
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2 ; add to the ref4 running sum
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4] ; advance 2 rows: stride (in pixels) * 2 bytes * 2 rows
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0 ; two groups of 8 px of src vs ref1..ref4; first=1 starts the sums in m4-m7, else adds to them
+ ; 1st 8 px
+ mova m0, [srcq +%2*2] ; aligned load of src (offsets count 16-bit pixels, hence *2); refs use unaligned movu
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4 ; saturating src - ref1
+ psubusw m2, m5 ; saturating src - ref2
+ psubusw m4, m0 ; saturating ref1 - src
+ psubusw m5, m0 ; saturating ref2 - src
+ por m4, m3 ; one of the two differences saturated to 0, so the OR is |src - ref1|
+ por m5, m2 ; |src - ref2|
+ pmaddwd m4, m1 ; m1 = words of 1 (set in HIGH_SADNXN4D): adjacent words are summed into dwords
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6 ; same sequence for ref3 / ref4
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3 ; |src - ref1|
+ mova m3, m0 ; fresh copy of src for the next reference
+ pmaddwd m2, m1
+ paddd m4, m2 ; add to the ref1 running sum
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2 ; add to the ref2 running sum
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2 ; add to the ref3 running sum
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2 ; add to the ref4 running sum
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2] ; always accumulates: m4-m7 already hold the 1st group's sums
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4] ; advance 2 rows; placed mid-sequence, the remaining ops below use registers only
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3 ; finish |src - ref4| for the 2nd group
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0 ; each 16-pixel group is handled as two 8-pixel groups (offset and offset+8)
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) ; group at off_first; "first" is forwarded only here
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 ; group at off_second; always accumulates, may advance pointers
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0 ; each 32-pixel group is handled as two 16-pixel groups (offset and offset+16)
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) ; group at off_first; "first" is forwarded only here
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 ; group at off_second; always accumulates, may advance pointers
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0 ; each 64-pixel group is handled as two 32-pixel groups (offset and offset+32)
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) ; group at off_first; "first" is forwarded only here
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 ; group at off_second; always accumulates, may advance pointers
+%endmacro
+
+; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; unsigned int res[4]);
+; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 or 4x4
+%macro HIGH_SADNXN4D 2 ; width, height
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4, one
+%else
+cglobal highbd_sad%1x%2x4d, 4, 8, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4, one
+%endif
+
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1] ; ref2..ref4 = ref[1..3]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0] ; loaded last: ref1q held the ref[] array base until here
+
+; convert byte pointers to short pointers (NOTE(review): presumably undoes a >>1 applied to the uint16_t address on the C side -- confirm)
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ mov oned, 0x00010001 ; two 16-bit words of value 1
+ movd m1, oned
+ pshufd m1, m1, 0x0 ; broadcast: every word of m1 is 1 (the pmaddwd multiplier used by HIGH_PROCESS_*)
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep ; first call + (%2-4)/2 repeats + last call, two rows each = %2 rows
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+ ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+ ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
+ movhlps m0, m4 ; low qword of m0 = upper two dwords of the ref1 sums
+ movhlps m1, m5 ; m1 (the ones constant) is no longer needed and is reused as scratch
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0 ; each accumulator: 4 partial sums folded to 2 (in dwords 0,1)
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5 ; interleave the ref1/ref2 partial sums
+ punpckldq m6, m7 ; interleave the ref3/ref4 partial sums
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0 ; m4 dwords 0,1 = totals for ref1, ref2
+ paddd m6, m1 ; m6 dwords 0,1 = totals for ref3, ref4
+ punpcklqdq m4, m6 ; m4 = {sad(ref1), sad(ref2), sad(ref3), sad(ref4)}
+ movifnidn r4, r4mp ; r4 = 5th argument (res); not register-loaded on the non-UNIX64 path, where r4 served as ref2
+ movu [r4], m4 ; store the four 32-bit SADs
+ RET
+%endmacro
+
+
+INIT_XMM sse2 ; instantiate the 4-reference high-bitdepth SAD once per (width, height) block size below
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_sad_sse2.asm
@@ -1,0 +1,363 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4 ; args: width, height, GPR count of the plain variant (5 or 7), avg flag (0 or 1)
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows ; 4 arguments, n_rows is the 5th GPR
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows ; two extra GPRs for 3*stride, n_rows is the 7th
+%endif ; %3 == 5/7
+%else ; avg: fifth argument second_pred, symbol name gets _avg appended
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows ; one more GPR than the plain variant
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3 ; all 7 named GPRs are taken, so n_rows gets no name here
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32: no eighth GPR, so the row counter reuses the stack slot of argument 0
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided ; widen the int strides to pointer width where the two differ
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3] ; 3 * stride, used to address the fourth row
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vp9_highbd_sad64x{32,64}[_avg]_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride[, uint8_t *second_pred]);
+%macro HIGH_SAD64XN 1-2 0 ; args: height, avg flag (defaults to 0)
+ HIGH_SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1 ; one row of 64 samples per loop pass
+ pxor m0, m0 ; m0: running dword sums
+ pxor m6, m6 ; m6: all zero, interleaved below to widen words to dwords
+
+.loop:
+ ; first half of each row
+ movu m1, [refq] ; 4 x 8 words = 32 reference samples, unaligned loads
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; ref = (ref + second_pred + 1) >> 1 per word
+ pavgw m2, [second_predq+mmsize*1] ; SSE2 memory operands must be 16-byte aligned
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4] ; second_pred is consumed contiguously
+%endif
+ mova m5, [srcq] ; aligned load, so src rows must be 16-byte aligned
+ psubusw m5, m1 ; saturating src - ref: 0 where ref >= src
+ psubusw m1, [srcq] ; saturating ref - src: 0 where src >= ref
+ por m1, m5 ; one side is always 0, so OR yields |src - ref|
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2 ; word lanes now hold 2 differences each
+ paddw m3, m4
+ movhlps m2, m1 ; bring the upper 4 words down ...
+ movhlps m4, m3
+ paddw m1, m2 ; ... so the low 4 words hold 4 differences each
+ paddw m3, m4 ; (cannot wrap while samples are below 2^14)
+ punpcklwd m1, m6 ; zero-extend the low 4 words to dwords
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; offsets restart at 0: the pointer was already advanced
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2] ; next row: the stride counts 16-bit samples, hence *2 for bytes
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop ; repeat until the row counter reaches 0
+
+ movhlps m1, m0 ; horizontal add of the four dwords in m0
+ paddd m0, m1 ; dwords 0 and 1 = two partial totals t0, t1
+ punpckldq m0, m6 ; m0 = {t0, 0, t1, 0}
+ movhlps m1, m0
+ paddd m0, m1 ; dword 0 = t0 + t1
+ movd eax, m0 ; return value
+ RET
+%endmacro
+
+INIT_XMM sse2 ; 128-bit XMM registers; emitted symbols carry the _sse2 suffix
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int vp9_highbd_sad32x{16,32,64}[_avg]_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride[, uint8_t *second_pred]);
+%macro HIGH_SAD32XN 1-2 0 ; args: height, avg flag (defaults to 0)
+ HIGH_SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1 ; one row of 32 samples per loop pass
+ pxor m0, m0 ; m0: running dword sums
+ pxor m6, m6 ; m6: all zero, interleaved below to widen words to dwords
+
+.loop:
+ movu m1, [refq] ; 4 x 8 words = one full 32-sample reference row, unaligned loads
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; ref = (ref + second_pred + 1) >> 1 per word
+ pavgw m2, [second_predq+mmsize*1] ; SSE2 memory operands must be 16-byte aligned
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4] ; second_pred is consumed contiguously
+%endif
+ mova m5, [srcq] ; aligned load, so src rows must be 16-byte aligned
+ psubusw m5, m1 ; saturating src - ref: 0 where ref >= src
+ psubusw m1, [srcq] ; saturating ref - src: 0 where src >= ref
+ por m1, m5 ; one side is always 0, so OR yields |src - ref|
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2 ; word lanes now hold 2 differences each
+ paddw m3, m4
+ movhlps m2, m1 ; bring the upper 4 words down ...
+ movhlps m4, m3
+ paddw m1, m2 ; ... so the low 4 words hold 4 differences each
+ paddw m3, m4 ; (cannot wrap while samples are below 2^14)
+ punpcklwd m1, m6 ; zero-extend the low 4 words to dwords
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2] ; next row: the stride counts 16-bit samples, hence *2 for bytes
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop ; repeat until the row counter reaches 0
+
+ movhlps m1, m0 ; horizontal add of the four dwords in m0
+ paddd m0, m1 ; dwords 0 and 1 = two partial totals t0, t1
+ punpckldq m0, m6 ; m0 = {t0, 0, t1, 0}
+ movhlps m1, m0
+ paddd m0, m1 ; dword 0 = t0 + t1
+ movd eax, m0 ; return value
+ RET
+%endmacro
+
+INIT_XMM sse2 ; 128-bit XMM registers; emitted symbols carry the _sse2 suffix
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vp9_highbd_sad16x{8,16,32}[_avg]_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride[, uint8_t *second_pred]);
+%macro HIGH_SAD16XN 1-2 0 ; args: height, avg flag (defaults to 0)
+ HIGH_SAD_FN 16, %1, 5, %2
+ mov n_rowsd, %1/2 ; two rows of 16 samples per loop pass
+ pxor m0, m0 ; m0: running dword sums
+ pxor m6, m6 ; m6: all zero, interleaved below to widen words to dwords
+
+.loop:
+ movu m1, [refq] ; row 0, samples 0-7 (unaligned loads)
+ movu m2, [refq+16] ; row 0, samples 8-15
+ movu m3, [refq+ref_strideq*2] ; row 1, samples 0-7 (stride in samples, *2 for bytes)
+ movu m4, [refq+ref_strideq*2+16] ; row 1, samples 8-15
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; ref = (ref + second_pred + 1) >> 1 per word
+ pavgw m2, [second_predq+16] ; same offset as mmsize*1
+ pavgw m3, [second_predq+mmsize*2] ; second_pred rows sit 32 bytes apart, with no stride
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4] ; skip the two rows just used
+%endif
+ mova m5, [srcq] ; aligned load, so src rows must be 16-byte aligned
+ psubusw m5, m1 ; saturating src - ref: 0 where ref >= src
+ psubusw m1, [srcq] ; saturating ref - src: 0 where src >= ref
+ por m1, m5 ; one side is always 0, so OR yields |src - ref|
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2 ; word lanes now hold 2 differences each
+ paddw m3, m4
+ movhlps m2, m1 ; bring the upper 4 words down ...
+ movhlps m4, m3
+ paddw m1, m2 ; ... so the low 4 words hold 4 differences each
+ paddw m3, m4 ; (cannot wrap while samples are below 2^14)
+ punpcklwd m1, m6 ; zero-extend the low 4 words to dwords
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4] ; advance two rows
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop ; repeat until the row-pair counter reaches 0
+
+ movhlps m1, m0 ; horizontal add of the four dwords in m0
+ paddd m0, m1 ; dwords 0 and 1 = two partial totals t0, t1
+ punpckldq m0, m6 ; m0 = {t0, 0, t1, 0}
+ movhlps m1, m0
+ paddd m0, m1 ; dword 0 = t0 + t1
+ movd eax, m0 ; return value
+ RET
+%endmacro
+
+INIT_XMM sse2 ; 128-bit XMM registers; emitted symbols carry the _sse2 suffix
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int vp9_highbd_sad8x{4,8,16}[_avg]_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride[, uint8_t *second_pred]);
+%macro HIGH_SAD8XN 1-2 0 ; args: height, avg flag (defaults to 0)
+ HIGH_SAD_FN 8, %1, 7, %2 ; 7-GPR form: also provides src_stride3q and ref_stride3q
+ mov n_rowsd, %1/4 ; four rows of 8 samples per loop pass; n_rowsd may be a stack slot
+ pxor m0, m0 ; m0: running dword sums
+ pxor m6, m6 ; m6: all zero, interleaved below to widen words to dwords
+
+.loop:
+ movu m1, [refq] ; row 0 (8 words, unaligned load)
+ movu m2, [refq+ref_strideq*2] ; row 1 (stride in samples, *2 for bytes)
+ movu m3, [refq+ref_strideq*4] ; row 2
+ movu m4, [refq+ref_stride3q*2] ; row 3
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0] ; ref = (ref + second_pred + 1) >> 1 per word
+ pavgw m2, [second_predq+mmsize*1] ; second_pred rows sit 16 bytes apart, with no stride
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4] ; skip the four rows just used
+%endif
+ mova m5, [srcq] ; aligned load, so every src row must be 16-byte aligned
+ psubusw m5, m1 ; saturating src - ref: 0 where ref >= src
+ psubusw m1, [srcq] ; saturating ref - src: 0 where src >= ref
+ por m1, m5 ; one side is always 0, so OR yields |src - ref|
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2 ; word lanes now hold 2 differences each
+ paddw m3, m4
+ movhlps m2, m1 ; bring the upper 4 words down ...
+ movhlps m4, m3
+ paddw m1, m2 ; ... so the low 4 words hold 4 differences each
+ paddw m3, m4 ; (cannot wrap while samples are below 2^14)
+ punpcklwd m1, m6 ; zero-extend the low 4 words to dwords
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8] ; advance four rows
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop ; repeat until the four-row counter reaches 0
+
+ movhlps m1, m0 ; horizontal add of the four dwords in m0
+ paddd m0, m1 ; dwords 0 and 1 = two partial totals t0, t1
+ punpckldq m0, m6 ; m0 = {t0, 0, t1, 0}
+ movhlps m1, m0
+ paddd m0, m1 ; dword 0 = t0 + t1
+ movd eax, m0 ; return value
+ RET
+%endmacro
+
+INIT_XMM sse2 ; 128-bit XMM registers; emitted symbols carry the _sse2 suffix
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,6 +102,9 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm
+endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
@@ -110,6 +113,9 @@
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm
+endif
endif
ifeq ($(ARCH_X86_64),yes)