shithub: libvpx

ref: 216b171d63feddb8328615e6530dabaf3ba84db9
parent: 83559e7357e1b2bb2cf55f4e8eb5d0f49dbccab6
parent: ed2dc59c1b363ab510344b28ee0367c50c5080c4
author: Jingning Han <[email protected]>
date: Thu Feb 19 10:08:11 EST 2015

Merge "Integral projection based motion estimation"
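
The patch replaces a full 2-D motion search with two 1-D searches: each block is reduced to its integral projections (per-column and per-row pixel sums), and the row and column components of the motion vector are estimated independently by sliding the source projection over the projection of a wider reference area and minimizing the SAD. Below is a minimal standalone sketch of the principle, with toy sizes and made-up names (the patch itself uses 64x64 blocks and a reference area extending 32 pixels to each side); it is illustrative only, not the patch's API.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BLK 8   /* toy block size; the patch uses 64 */
#define WIN 24  /* width of the reference strip to search */

/* Horizontal projection: proj[c] = sum of column c over h rows. */
static void project_cols(const unsigned char *buf, int stride,
                         int w, int h, int *proj) {
  int c, r;
  for (c = 0; c < w; ++c) {
    proj[c] = 0;
    for (r = 0; r < h; ++r)
      proj[c] += buf[r * stride + c];
  }
}

/* SAD between two projection vectors of length n. */
static int vec_sad(const int *a, const int *b, int n) {
  int i, sad = 0;
  for (i = 0; i < n; ++i)
    sad += abs(a[i] - b[i]);
  return sad;
}

int main(void) {
  unsigned char ref[BLK * WIN], src[BLK * BLK];
  int ref_proj[WIN], src_proj[BLK];
  int r, c, d, best_d = 0, best_sad = INT_MAX;

  /* Fill the reference strip with a horizontal gradient and cut the
   * source block out of it at offset 5, so the search should find 5. */
  for (r = 0; r < BLK; ++r)
    for (c = 0; c < WIN; ++c)
      ref[r * WIN + c] = (unsigned char)(c * 10);
  for (r = 0; r < BLK; ++r)
    for (c = 0; c < BLK; ++c)
      src[r * BLK + c] = ref[r * WIN + c + 5];

  project_cols(ref, WIN, WIN, BLK, ref_proj);
  project_cols(src, BLK, BLK, BLK, src_proj);

  /* One 1-D pass recovers the horizontal component; the vertical
   * component would use row projections the same way. */
  for (d = 0; d + BLK <= WIN; ++d) {
    int sad = vec_sad(&ref_proj[d], src_proj, BLK);
    if (sad < best_sad) {
      best_sad = sad;
      best_d = d;
    }
  }
  printf("best horizontal offset: %d (expected 5)\n", best_d);
  return 0;
}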

--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1109,6 +1109,15 @@
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
 specialize qw/vp9_avg_4x4 sse2/;
 
+add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+specialize qw/vp9_int_pro_row sse2/;
+
+add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
+specialize qw/vp9_int_pro_col sse2/;
+
+add_proto qw/int vp9_vector_sad/, "int16_t const *ref, int16_t const *src, const int width";
+specialize qw/vp9_vector_sad sse2/;
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
   specialize qw/vp9_highbd_avg_8x8/;
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,36 @@
   return (sum + 8) >> 4;
 }
 
+// Integer projection onto row vectors: each hbuf entry is the sum of one
+// column of pixels over the given height.
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    for (i = 0; i < height; ++i)
+      hbuf[idx] += ref[i * ref_stride];
+    ++ref;
+  }
+}
+
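+// Integer projection onto column vectors: returns the sum of one row of
+// pixels (the caller iterates over rows).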
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  for (idx = 0; idx < width; ++idx)
+    sum += ref[idx];
+  return sum;
+}
+
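+// Sum of absolute differences between two 1-D projection vectors.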
+int vp9_vector_sad_c(int16_t const *ref, int16_t const *src,
+                     const int width) {
+  int i;
+  int this_sad = 0;
+  for (i = 0; i < width; ++i)
+    this_sad += abs(ref[i] - src[i]);
+  return this_sad;
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -507,7 +507,120 @@
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define GLOBAL_MOTION 0
+#else
+#define GLOBAL_MOTION 1
+#endif
 
+#if GLOBAL_MOTION
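+// Coarse-to-fine 1-D search: slide the 64-entry source projection over the
+// 128-entry reference projection, testing offsets 0..64 in steps of 16 and
+// then refining in steps of 4, 2, and 1. Returns the best offset relative
+// to the zero-motion position at 32.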
+static int vector_match(int16_t *ref, int16_t *src) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  for (d = 0; d <= 64; d += 16) {
+    this_sad = vp9_vector_sad(&ref[d], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 4) {
+    int this_pos = offset + d;
+    // Skip out-of-range offsets and the already-tested zero-motion point.
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 2) {
+    int this_pos = offset + d;
+    // Skip out-of-range offsets and the already-tested zero-motion point.
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 1) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - 32);
+}
+
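+// Estimate a coarse motion vector for the 64x64 block by matching its row
+// and column projections against those of a reference area extending 32
+// pixels beyond the block on each side.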
+static void motion_estimation(MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int stride = 64;
+  const int search_width = 128;
+  const int search_height = 128;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0].src_mi->mbmi.mv[0].as_mv;
+
+  // Build the horizontal projection of the reference area, starting 32
+  // columns left of the co-located position.
+  ref_buf = xd->plane[0].pre[0].buf + (-32);
+  for (idx = 0; idx < search_width; idx += 16) {
+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, 64);
+    ref_buf += 16;
+  }
+
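+  // Build the vertical projection of the reference area, starting 32 rows
+  // above the co-located position.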
+  ref_buf = xd->plane[0].pre[0].buf + (-32) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vp9_int_pro_col(ref_buf, 64);
+    ref_buf += ref_stride;
+  }
+
+  // Build the 1-D projections of the 64x64 source block.
+  for (idx = 0; idx < stride; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, 64);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < stride; ++idx) {
+    src_vbuf[idx] = vp9_int_pro_col(src_buf, 64);
+    src_buf += src_stride;
+  }
+
+  // Find the best match in each dimension with a 1-D search.
+  tmp_mv->col = vector_match(hbuf, src_hbuf);
+  tmp_mv->row = vector_match(vbuf, src_vbuf);
+
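+  // Scale to the 1/8-pel units used for motion vectors.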
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  x->pred_mv[LAST_FRAME] = *tmp_mv;
+}
+#endif
+
 // This function chooses partitioning based on the variance between the
 // source and the reconstructed last frame, where variance is computed for
 // down-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
@@ -551,6 +664,11 @@
     mbmi->ref_frame[1] = NONE;
     mbmi->sb_type = BLOCK_64X64;
     mbmi->mv[0].as_int = 0;
+
+#if GLOBAL_MOTION
+    motion_estimation(x);
+#endif
+
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     for (i = 1; i <= 2; ++i) {
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -56,3 +56,117 @@
   avg = _mm_extract_epi16(s0, 0);
   return (avg + 8) >> 4;
 }
+
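+// SSE2 counterpart of vp9_int_pro_row_c: one 16-pixel row is loaded per
+// vector, widened to 16 bits, and accumulated with saturating adds, two
+// rows per loop iteration.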
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_load_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_load_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+
+  src_line = _mm_load_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+
+  _mm_store_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_store_si128((__m128i *)hbuf, s1);
+}
+
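+// SSE2 counterpart of vp9_int_pro_col_c: _mm_sad_epu8 against zero sums
+// each 16-byte group, and the partial sums are then combined. Assumes
+// width == 64.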
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  (void) width;  // width = 64
+
+  ref += 16;
+  src_line = _mm_load_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  ref += 16;
+  src_line = _mm_load_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  ref += 16;
+  src_line = _mm_load_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
+
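+// SSE2 counterpart of vp9_vector_sad_c: absolute differences are computed
+// with the sign-mask identity |x| = (x ^ s) - s, where s = x >> 15, and
+// accumulated in 32 bits. Assumes width == 64.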
+int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
+                        const int width) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i sum;
+  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+  __m128i v1 = _mm_load_si128((const __m128i *)src);
+  __m128i diff = _mm_subs_epi16(v0, v1);
+  __m128i sign = _mm_srai_epi16(diff, 15);
+
+  diff = _mm_xor_si128(diff, sign);
+  sum = _mm_sub_epi16(diff, sign);
+
+  (void) width;  // width = 64;
+
+  ref += 8;
+  src += 8;
+
+  v0 = _mm_unpacklo_epi16(sum, zero);
+  v1 = _mm_unpackhi_epi16(sum, zero);
+  sum = _mm_add_epi32(v0, v1);
+
+  for (idx = 1; idx < 8; ++idx) {
+    v0 = _mm_loadu_si128((const __m128i *)ref);
+    v1 = _mm_load_si128((const __m128i *)src);
+    diff = _mm_subs_epi16(v0, v1);
+    sign = _mm_srai_epi16(diff, 15);
+    diff = _mm_xor_si128(diff, sign);
+    diff = _mm_sub_epi16(diff, sign);
+
+    v0 = _mm_unpacklo_epi16(diff, zero);
+    v1 = _mm_unpackhi_epi16(diff, zero);
+
+    sum = _mm_add_epi32(sum, v0);
+    sum = _mm_add_epi32(sum, v1);
+
+    ref += 8;
+    src += 8;
+  }
+
+  v0 = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi32(sum, v0);
+  v0 = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi32(sum, v0);
+
+  return _mm_cvtsi128_si32(sum);
+}