ref: 1790d45252efb29903baae3d1776bb24cee76558
parent: f4e0eb17e85eb84c162ac0218cf32a9eccece353
author: Jingning Han <[email protected]>
date: Fri Feb 27 08:35:22 EST 2015
Use variance metric for integral projection vector match. This commit replaces SAD with variance as the metric for the integral projection vector match, which improves search accuracy in the presence of slight lighting changes. At speed -6, the average compression performance on the rtc set improves by 1.7%. No speed change is observed on the test clips. Change-Id: I71c1d27e42de2aa429fb3564e6549bba1c7d6d4d
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1115,8 +1115,8 @@
add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
specialize qw/vp9_int_pro_col sse2/;
-add_proto qw/int vp9_vector_sad/, "int16_t const *ref, int16_t const *src, const int width";
-specialize qw/vp9_vector_sad sse2/;
+add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp9_vector_var sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -37,6 +37,7 @@
hbuf[idx] = 0;
for (i = 0; i < height; ++i)
hbuf[idx] += ref[i * ref_stride];
+ hbuf[idx] /= 32;
++ref;
}
}
@@ -46,16 +47,23 @@
int16_t sum = 0;
for (idx = 0; idx < width; ++idx)
sum += ref[idx];
- return sum;
+ return sum / 32;
}
-int vp9_vector_sad_c(int16_t const *ref, int16_t const *src,
- const int width) {
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+ const int bwl) {
int i;
- int this_sad = 0;
- for (i = 0; i < width; ++i)
- this_sad += abs(ref[i] - src[i]);
- return this_sad;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i];
+ mean += diff;
+ sse += diff * diff;
+ }
+
+ var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
}
#if CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -519,14 +519,14 @@
#endif
#if GLOBAL_MOTION
-static int vector_match(int16_t *ref, int16_t *src, int length) {
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
int best_sad = INT_MAX;
int this_sad;
int d;
int center, offset = 0;
- int bw = length; // redundant variable, to be changed in the experiments.
+ int bw = 4 << bwl; // redundant variable, to be changed in the experiments.
for (d = 0; d <= bw; d += 16) {
- this_sad = vp9_vector_sad(&ref[d], src, length);
+ this_sad = vp9_vector_var(&ref[d], src, bwl);
if (this_sad < best_sad) {
best_sad = this_sad;
offset = d;
@@ -539,7 +539,7 @@
// check limit
if (this_pos < 0 || this_pos > bw)
continue;
- this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
if (this_sad < best_sad) {
best_sad = this_sad;
center = this_pos;
@@ -552,7 +552,7 @@
// check limit
if (this_pos < 0 || this_pos > bw)
continue;
- this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
if (this_sad < best_sad) {
best_sad = this_sad;
center = this_pos;
@@ -565,7 +565,7 @@
// check limit
if (this_pos < 0 || this_pos > bw)
continue;
- this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
if (this_sad < best_sad) {
best_sad = this_sad;
center = this_pos;
@@ -578,7 +578,7 @@
// check limit
if (this_pos < 0 || this_pos > bw)
continue;
- this_sad = vp9_vector_sad(&ref[this_pos], src, length);
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
if (this_sad < best_sad) {
best_sad = this_sad;
center = this_pos;
@@ -638,8 +638,8 @@
}
// Find the best match per 1-D search
- tmp_mv->col = vector_match(hbuf, src_hbuf, bw);
- tmp_mv->row = vector_match(vbuf, src_vbuf, bh);
+ tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
best_sad = INT_MAX;
this_mv = *tmp_mv;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -192,7 +192,6 @@
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
- x->pred_mv[ref] = tmp_mv->as_mv;
}
if (scaled_ref_frame) {
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -90,6 +90,9 @@
s0 = _mm_adds_epu16(s0, t0);
s1 = _mm_adds_epu16(s1, t1);
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+
_mm_store_si128((__m128i *)hbuf, s0);
hbuf += 8;
_mm_store_si128((__m128i *)hbuf, s1);
@@ -112,51 +115,49 @@
s1 = _mm_srli_si128(s0, 8);
s0 = _mm_adds_epu16(s0, s1);
- return _mm_extract_epi16(s0, 0);
+ return (_mm_extract_epi16(s0, 0)) >> 5;
}
-int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
- const int width) {
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+ const int bwl) {
int idx;
- __m128i zero = _mm_setzero_si128();
- __m128i sum;
+ int width = 4 << bwl;
+ int16_t mean;
__m128i v0 = _mm_loadu_si128((const __m128i *)ref);
__m128i v1 = _mm_load_si128((const __m128i *)src);
__m128i diff = _mm_subs_epi16(v0, v1);
- __m128i sign = _mm_srai_epi16(diff, 15);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
- diff = _mm_xor_si128(diff, sign);
- sum = _mm_sub_epi16(diff, sign);
-
ref += 8;
src += 8;
- v0 = _mm_unpacklo_epi16(sum, zero);
- v1 = _mm_unpackhi_epi16(sum, zero);
- sum = _mm_add_epi32(v0, v1);
-
for (idx = 8; idx < width; idx += 8) {
v0 = _mm_loadu_si128((const __m128i *)ref);
v1 = _mm_load_si128((const __m128i *)src);
diff = _mm_subs_epi16(v0, v1);
- sign = _mm_srai_epi16(diff, 15);
- diff = _mm_xor_si128(diff, sign);
- diff = _mm_sub_epi16(diff, sign);
- v0 = _mm_unpacklo_epi16(diff, zero);
- v1 = _mm_unpackhi_epi16(diff, zero);
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
- sum = _mm_add_epi32(sum, v0);
- sum = _mm_add_epi32(sum, v1);
-
ref += 8;
src += 8;
}
- v0 = _mm_srli_si128(sum, 8);
- sum = _mm_add_epi32(sum, v0);
- v0 = _mm_srli_epi64(sum, 32);
- sum = _mm_add_epi32(sum, v0);
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
- return _mm_cvtsi128_si32(sum);
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = _mm_extract_epi16(sum, 0);
+
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}