ref: 9a480482cbc7f0d359d959bb2cfe097d0a672d6b
parent: 869d770610d0b32216279e66cfb58f5817460df2
parent: 1e6a32f1af8066fd0b718b11f00cb09104280f49
author: Ronald S. Bultje <[email protected]>
date: Fri Jun 21 08:49:43 EDT 2013
Merge "SSE2/SSSE3 optimizations and unit test for sub_pixel_avg_variance()."
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -76,6 +76,34 @@
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
+static unsigned int subpel_avg_variance_ref(const uint8_t *ref,  // C model of the avg sub-pixel variance; ref rows are w + 1 bytes apart
+ const uint8_t *src,  // block compared against the prediction, rows w bytes apart
+ const uint8_t *second_pred,  // second predictor, rows w bytes apart
+ int l2w, int l2h,  // log2 of the block width and height
+ int xoff, int yoff,  // horizontal / vertical filter weights in 1/16 units
+ unsigned int *sse_ptr) {  // out: sum of squared differences
+ int se = 0;  // signed sum of differences
+ unsigned int sse = 0;  // sum of squared differences
+ const int w = 1 << l2w, h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step; touches rows y..y+1 and columns x..x+1 of ref
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];  // top-left
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];  // top-right
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];  // bottom-left
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];  // bottom-right
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);  // horizontal blend of the top row, rounded
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);  // horizontal blend of the bottom row, rounded
+ const int r = a + (((b - a) * yoff + 8) >> 4);  // vertical blend of the two rows, rounded
+ int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];  // round-up average with second_pred, minus src
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ *sse_ptr = sse;
+ return sse - (((int64_t) se * se) >> (l2w + l2h));  // sse - se^2 / (w * h), with se * se computed in 64 bits
+}
+
template<typename VarianceFunctionType>
class VarianceTest :
public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -174,6 +202,7 @@
rnd(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = new uint8_t[block_size_];
+ sec_ = new uint8_t[block_size_];
ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
@@ -182,6 +211,7 @@
virtual void TearDown() {
delete[] src_;
delete[] ref_;
+ delete[] sec_;
}
protected:
@@ -188,8 +218,9 @@
void RefTest();
ACMRandom rnd;
- uint8_t* src_;
- uint8_t* ref_;
+ uint8_t *src_;
+ uint8_t *ref_;
+ uint8_t *sec_;
int width_, log2width_;
int height_, log2height_;
int block_size_;
@@ -217,6 +248,29 @@
}
}
+template<>  // RefTest for the averaging variant: checks subpel_variance_ against subpel_avg_variance_ref
+void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
+ for (int x = 0; x < 16; ++x) {  // every horizontal offset 0..15
+ for (int y = 0; y < 16; ++y) {  // every vertical offset 0..15
+ for (int j = 0; j < block_size_; j++) {  // fresh random source and second predictor for each offset pair
+ src_[j] = rnd.Rand8();
+ sec_[j] = rnd.Rand8();
+ }
+ for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {  // same length as the ref_ allocation
+ ref_[j] = rnd.Rand8();
+ }
+ unsigned int sse1, sse2;
+ const unsigned int var1 = subpel_variance_(ref_, width_ + 1, x, y,  // subpel_variance_ is declared outside this hunk; presumably the function under test
+ src_, width_, &sse1, sec_);
+ const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_,  // reference result for the same buffers and offsets
+ log2width_, log2height_,
+ x, y, &sse2);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;  // both the SSE output and the returned variance must match
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+}
+
// -----------------------------------------------------------------------------
// VP8 test cases.
@@ -283,10 +337,12 @@
#if CONFIG_VP9_ENCODER
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
+typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
TEST_P(VP9VarianceTest, Ref) { RefTest(); }
TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
@@ -360,6 +416,48 @@
make_tuple(6, 5, subpel_variance64x32_c),
make_tuple(6, 6, subpel_variance64x64_c)));
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =  // named pointers to the C implementations, used as test parameters below
+ vp9_sub_pixel_avg_variance4x4_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
+ vp9_sub_pixel_avg_variance4x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
+ vp9_sub_pixel_avg_variance8x4_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
+ vp9_sub_pixel_avg_variance8x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
+ vp9_sub_pixel_avg_variance8x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
+ vp9_sub_pixel_avg_variance16x8_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
+ vp9_sub_pixel_avg_variance16x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
+ vp9_sub_pixel_avg_variance16x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
+ vp9_sub_pixel_avg_variance32x16_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
+ vp9_sub_pixel_avg_variance32x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
+ vp9_sub_pixel_avg_variance32x64_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
+ vp9_sub_pixel_avg_variance64x32_c;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
+ vp9_sub_pixel_avg_variance64x64_c;
+INSTANTIATE_TEST_CASE_P(  // instantiates VP9SubpelAvgVarianceTest once per tuple under the "C" prefix
+ C, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c),  // tuple looks like (log2 width, log2 height, function), e.g. 4x8 -> (2, 3)
+ make_tuple(2, 3, subpel_avg_variance4x8_c),
+ make_tuple(3, 2, subpel_avg_variance8x4_c),
+ make_tuple(3, 3, subpel_avg_variance8x8_c),
+ make_tuple(3, 4, subpel_avg_variance8x16_c),
+ make_tuple(4, 3, subpel_avg_variance16x8_c),
+ make_tuple(4, 4, subpel_avg_variance16x16_c),
+ make_tuple(4, 5, subpel_avg_variance16x32_c),
+ make_tuple(5, 4, subpel_avg_variance32x16_c),
+ make_tuple(5, 5, subpel_avg_variance32x32_c),
+ make_tuple(5, 6, subpel_avg_variance32x64_c),
+ make_tuple(6, 5, subpel_avg_variance64x32_c),
+ make_tuple(6, 6, subpel_avg_variance64x64_c)));
+
#if HAVE_MMX
const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
@@ -446,6 +544,48 @@
make_tuple(5, 6, subpel_variance32x64_sse2),
make_tuple(6, 5, subpel_variance64x32_sse2),
make_tuple(6, 6, subpel_variance64x64_sse2)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =  // the two 4-wide kernels carry the _sse suffix, not _sse2
+ vp9_sub_pixel_avg_variance4x4_sse;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
+ vp9_sub_pixel_avg_variance4x8_sse;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =  // 8-wide and larger sizes use the _sse2 kernels
+ vp9_sub_pixel_avg_variance8x4_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
+ vp9_sub_pixel_avg_variance8x8_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
+ vp9_sub_pixel_avg_variance8x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
+ vp9_sub_pixel_avg_variance16x8_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
+ vp9_sub_pixel_avg_variance16x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
+ vp9_sub_pixel_avg_variance16x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
+ vp9_sub_pixel_avg_variance32x16_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
+ vp9_sub_pixel_avg_variance32x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
+ vp9_sub_pixel_avg_variance32x64_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
+ vp9_sub_pixel_avg_variance64x32_sse2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
+ vp9_sub_pixel_avg_variance64x64_sse2;
+INSTANTIATE_TEST_CASE_P(  // instantiates VP9SubpelAvgVarianceTest once per tuple under the "SSE2" prefix
+ SSE2, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse),  // tuple looks like (log2 width, log2 height, function), e.g. 4x8 -> (2, 3)
+ make_tuple(2, 3, subpel_avg_variance4x8_sse),
+ make_tuple(3, 2, subpel_avg_variance8x4_sse2),
+ make_tuple(3, 3, subpel_avg_variance8x8_sse2),
+ make_tuple(3, 4, subpel_avg_variance8x16_sse2),
+ make_tuple(4, 3, subpel_avg_variance16x8_sse2),
+ make_tuple(4, 4, subpel_avg_variance16x16_sse2),
+ make_tuple(4, 5, subpel_avg_variance16x32_sse2),
+ make_tuple(5, 4, subpel_avg_variance32x16_sse2),
+ make_tuple(5, 5, subpel_avg_variance32x32_sse2),
+ make_tuple(5, 6, subpel_avg_variance32x64_sse2),
+ make_tuple(6, 5, subpel_avg_variance64x32_sse2),
+ make_tuple(6, 6, subpel_avg_variance64x64_sse2)));
#endif
#if HAVE_SSSE3
@@ -490,6 +630,48 @@
make_tuple(5, 6, subpel_variance32x64_ssse3),
make_tuple(6, 5, subpel_variance64x32_ssse3),
make_tuple(6, 6, subpel_variance64x64_ssse3)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =  // named pointers to the SSSE3 implementations, used as test parameters below
+ vp9_sub_pixel_avg_variance4x4_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
+ vp9_sub_pixel_avg_variance4x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
+ vp9_sub_pixel_avg_variance8x4_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
+ vp9_sub_pixel_avg_variance8x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
+ vp9_sub_pixel_avg_variance8x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
+ vp9_sub_pixel_avg_variance16x8_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
+ vp9_sub_pixel_avg_variance16x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
+ vp9_sub_pixel_avg_variance16x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
+ vp9_sub_pixel_avg_variance32x16_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
+ vp9_sub_pixel_avg_variance32x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
+ vp9_sub_pixel_avg_variance32x64_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
+ vp9_sub_pixel_avg_variance64x32_ssse3;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
+ vp9_sub_pixel_avg_variance64x64_ssse3;
+INSTANTIATE_TEST_CASE_P(  // instantiates VP9SubpelAvgVarianceTest once per tuple under the "SSSE3" prefix
+ SSSE3, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3),  // tuple looks like (log2 width, log2 height, function), e.g. 4x8 -> (2, 3)
+ make_tuple(2, 3, subpel_avg_variance4x8_ssse3),
+ make_tuple(3, 2, subpel_avg_variance8x4_ssse3),
+ make_tuple(3, 3, subpel_avg_variance8x8_ssse3),
+ make_tuple(3, 4, subpel_avg_variance8x16_ssse3),
+ make_tuple(4, 3, subpel_avg_variance16x8_ssse3),
+ make_tuple(4, 4, subpel_avg_variance16x16_ssse3),
+ make_tuple(4, 5, subpel_avg_variance16x32_ssse3),
+ make_tuple(5, 4, subpel_avg_variance32x16_ssse3),
+ make_tuple(5, 5, subpel_avg_variance32x32_ssse3),
+ make_tuple(5, 6, subpel_avg_variance32x64_ssse3),
+ make_tuple(6, 5, subpel_avg_variance64x32_ssse3),
+ make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif // CONFIG_VP9_ENCODER
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -269,61 +269,61 @@
specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64
+specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x64
+specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x32
+specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x16
+specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x32
+specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32
+specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x16
+specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x16
+specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x8
+specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x8
+specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
@@ -330,13 +330,13 @@
specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x4
+specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x8
+specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse ssse3
@@ -343,7 +343,7 @@
#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x4
+specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -116,7 +116,7 @@
RET
%endmacro
-%macro SUBPEL_VARIANCE 1 ; W
+%macro SUBPEL_VARIANCE 1-2 0 ; W, [avg]: %1 = block width; optional %2 (default 0), 1 selects the sub_pixel_avg_variance code paths
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
@@ -128,12 +128,38 @@
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
%ifdef PIC
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+%define sec_str sec_strideq
+%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
+%endif
+%define h heightd
%define bilin_filter sseq
%else
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+%if ARCH_X86_64
+%define h heightd
+%define sec_str sec_strideq
+%else
+%define h dword heightm
+%define sec_str sec_stridemp
+%endif
+%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
dst, dst_stride, height, sse
+%define h heightd
+%endif
%define bilin_filter bilin_filter_m
%endif
ASSERT %1 <= 16 ; m6 overflows if w > 16
@@ -143,8 +169,11 @@
; could perhaps use it for something more productive then
pxor m5, m5 ; dedicated zero register
%if %1 < 16
- sar heightd, 1
+ sar h, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
%endif
+%endif
; FIXME(rbultje) replace by jumptable?
test x_offsetd, x_offsetd
@@ -158,30 +187,55 @@
%if %1 == 16
movu m0, [srcq]
mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
+%if %2 == 0 ; !avg
punpckhbw m3, m1, m5
punpcklbw m1, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+%endif
+%else ; !avg
movh m2, [srcq+src_strideq]
+%endif
movh m1, [dstq]
movh m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
punpcklbw m0, m5
+%else ; !avg
+ punpcklbw m0, m5
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_zero_y_zero_loop
STORE_AND_RET
@@ -196,18 +250,40 @@
movu m4, [srcq+src_strideq]
mova m1, [dstq]
pavgb m0, m4
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+ punpckldq m2, [srcq+src_strideq*2]
+%endif
+ movh m1, [dstq]
+%if mmsize == 16
+ movlhps m0, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+%endif
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
movh m4, [srcq+src_strideq*2]
movh m1, [dstq]
pavgb m0, m2
@@ -217,12 +293,16 @@
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_zero_y_half_loop
STORE_AND_RET
@@ -280,6 +360,13 @@
%endif
psraw m2, 4
psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -286,7 +373,6 @@
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m2, [srcq+src_strideq]
@@ -318,13 +404,23 @@
%endif
psraw m0, 4
psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
@@ -345,18 +441,37 @@
movu m4, [srcq+1]
mova m1, [dstq]
pavgb m0, m4
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
SUM_SSE m0, m1, m2, m3, m6, m7
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+ punpckldq m4, [srcq+src_strideq+1]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
movh m2, [srcq+src_strideq]
movh m1, [dstq]
pavgb m0, m4
@@ -367,12 +482,16 @@
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_half_y_zero_loop
STORE_AND_RET
@@ -391,17 +510,23 @@
movu m3, [srcq+1]
mova m1, [dstq]
pavgb m4, m3
+ punpckhbw m3, m1, m5
pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
punpckhbw m2, m0, m5
punpcklbw m0, m5
- punpckhbw m3, m1, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
punpcklbw m1, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
@@ -410,6 +535,31 @@
.x_half_y_half_loop:
movh m2, [srcq]
movh m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ punpckldq m2, [srcq+src_strideq]
+ punpckldq m3, [srcq+src_strideq+1]
+%endif
+ pavgb m2, m3
+%if mmsize == 16
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+ pshufw m4, m2, 0xe
+%endif
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
movh m4, [srcq+src_strideq]
movh m1, [srcq+src_strideq+1]
pavgb m2, m3
@@ -422,13 +572,17 @@
punpcklbw m2, m5
punpcklbw m3, m5
punpcklbw m1, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_half_y_half_loop
STORE_AND_RET
@@ -488,6 +642,13 @@
%endif
punpckhbw m3, m1, m5
psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
@@ -494,7 +655,6 @@
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m3, [srcq+1]
@@ -536,6 +696,13 @@
%endif
psraw m0, 4
psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
@@ -542,8 +709,11 @@
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
@@ -602,6 +772,13 @@
%endif
psraw m2, 4
psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpckhbw m3, m1, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -608,7 +785,6 @@
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
@@ -642,13 +818,23 @@
%endif
psraw m0, 4
psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
@@ -724,8 +910,6 @@
pavgb m0, m4
punpckhbw m3, m1, m5
punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
%else
punpckhbw m2, m4, m5
punpckhbw m1, m3, m5
@@ -750,15 +934,18 @@
packuswb m4, m2
punpcklbw m1, m5
pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
punpckhbw m2, m0, m5
punpcklbw m0, m5
-%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
@@ -810,6 +997,13 @@
psraw m4, 4
pavgw m0, m2
pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
punpcklbw m3, m5
punpcklbw m1, m5
SUM_SSE m0, m1, m2, m3, m6, m7
@@ -817,8 +1011,11 @@
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
@@ -942,12 +1139,18 @@
psraw m0, 4
punpcklbw m1, m5
%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
add srcq, src_strideq
add dstq, dst_strideq
- dec heightd
%else ; %1 < 16
movh m0, [srcq]
movh m1, [srcq+1]
@@ -1026,13 +1229,23 @@
punpcklbw m3, m5
punpcklbw m1, m5
%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
lea srcq, [srcq+src_strideq*2]
lea dstq, [dstq+dst_strideq*2]
- dec heightd
%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
@@ -1059,3 +1272,15 @@
INIT_XMM ssse3
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
+
+INIT_MMX sse ; avg kernels: second macro argument 1 takes the "%if %2 == 1 ; avg" branches
+SUBPEL_VARIANCE 4, 1 ; sub_pixel_avg_variance4xh, declared on the C side with the _sse suffix
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1 ; sub_pixel_avg_variance8xh, _sse2 suffix on the C side
+SUBPEL_VARIANCE 16, 1 ; sub_pixel_avg_variance16xh, _sse2 suffix on the C side
+
+INIT_MMX ssse3 ; same three widths again for the ssse3 cpu flag
+SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -343,29 +343,22 @@
return (var - (((int64_t)avg * avg) >> 11));
}
+#define DECL(w, opt) /* prototype of one w-wide, variable-height sub-pixel variance kernel for cpu flavour opt */ \
+int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse) /* no trailing semicolon: the user of DECL supplies it */
#define DECLS(opt1, opt2) \
-int vp9_sub_pixel_variance4xh_##opt2(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse); \
-int vp9_sub_pixel_variance8xh_##opt1(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse); \
-int vp9_sub_pixel_variance16xh_##opt1(const uint8_t *src, \
- ptrdiff_t src_stride, \
- int x_offset, int y_offset, \
- const uint8_t *dst, \
- ptrdiff_t dst_stride, \
- int height, unsigned int *sse)
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
+#undef DECL
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
@@ -403,6 +396,86 @@
} \
} \
*sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) /* one FN(w, h, wf, wlog2, hlog2, opt, cast) per block size; wf is presumably the width of the xh kernel FN calls */ \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); /* NOTE(review): cast column looks sized so se * se cannot overflow - confirm */ \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16, 8, 16, 4, 3, opt1,); /* empty cast argument: se * se stays a plain int product */ \
+FN(8, 16, 8, 3, 4, opt1,); \
+FN(8, 8, 8, 3, 3, opt1,); \
+FN(8, 4, 8, 3, 2, opt1,); \
+FN(4, 8, 4, 2, 3, opt2,); /* the two 4-wide sizes use opt2 instead of opt1 */ \
+FN(4, 4, 4, 2, 2, opt2,)
+
+FNS(sse2, sse); /* opt1 = sse2, opt2 = sse */
+FNS(ssse3, ssse3); /* opt1 = opt2 = ssse3 */
+
+#undef FNS
+#undef FN
+
+#define DECL(w, opt) /* prototype of one w-wide, variable-height avg kernel for cpu flavour opt */ \
+int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *sec, /* second predictor */ \
+ ptrdiff_t sec_stride, \
+ int height, unsigned int *sse) /* result is used by FN below as the signed sum of differences (se) */
+#define DECLS(opt1, opt2) /* 4-wide kernel uses opt2; 8- and 16-wide kernels use opt1 */ \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse); /* 4xh_sse, 8xh_sse2, 16xh_sse2 */
+DECLS(ssse3, ssse3); /* 4xh_ssse3, 8xh_ssse3, 16xh_ssse3 */
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) /* defines the w x h avg wrapper around the wf-wide xh kernel */ \
+unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; /* running sum of squared differences */ \
+ int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, /* first column strip; se is the running sum of differences */ \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); /* sec is passed with stride w and height h */ \
+ if (w > wf) { /* block wider than one kernel call: add the strip at byte offset 16 */ \
+ unsigned int sse2; \
+ int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, /* NOTE(review): offsets 16/32/48 are hard-coded, so this presumably relies on wf == 16 whenever w > wf - confirm */ \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { /* more than two strips: also add the strips at byte offsets 32 and 48 */ \
+ se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; /* report the accumulated SSE to the caller */ \
return sse - ((cast se * se) >> (wlog2 + hlog2)); /* sse - se^2 / (w * h); cast is applied to se before the multiply */ \
}