shithub: libvpx

--- /dev/null

+++ b/test/sum_squares_test.cc

@@ -1,0 +1,115 @@

+/*

+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <cmath>

+#include <cstdlib>

+#include <string>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "./vpx_config.h"

+#include "./vpx_dsp_rtcd.h"

+#include "test/acm_random.h"

+#include "test/clear_system_state.h"

+#include "test/register_state_check.h"

+#include "test/util.h"

+#include "vpx_ports/mem.h"

+using libvpx_test::ACMRandom;

+namespace {

+const int kNumIterations = 10000;  // number of random trials run by each TEST_P body in this file

+typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);  // common signature of the kernels being compared

+typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;  // (reference function, function under test)

+class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {  // fixture parameterized on a (reference, tested) function pair

+ public:

+  virtual ~SumSquaresTest() {}

+  virtual void SetUp() {

+    ref_func_ = GET_PARAM(0);  // tuple element 0: implementation whose result is taken as expected

+    tst_func_ = GET_PARAM(1);  // tuple element 1: implementation being checked

+  }

+  virtual void TearDown() { libvpx_test::ClearSystemState(); }  // invoked by gtest after each test body

+ protected:

+  SSI16Func ref_func_;  // produces res_ref in the tests

+  SSI16Func tst_func_;  // produces res_tst in the tests

+};

+TEST_P(SumSquaresTest, OperationCheck) {  // compares tst_func_ with ref_func_ on randomly filled blocks

+  ACMRandom rnd(ACMRandom::DeterministicSeed());

+  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);  // room for 128 rows at the largest stride (256) generated below

+  const int msb = 11;  // Up to 12 bit input

+  const int limit = 1 << (msb + 1);  // 4096; sample magnitudes are requested as rnd(limit)

+  for (int k = 0; k < kNumIterations; k++) {

+    const int size = 4 << rnd(6);    // Up to 128x128

+    int stride = 4 << rnd(7);  // Up to 256 stride

+    while (stride < size) {   // Make sure it's valid

+      stride = 4 << rnd(7);

+    }

+    for (int i = 0; i < size; ++i) {

+      for (int j = 0; j < size; ++j) {

+        src[i * stride + j] = rnd(2) ? rnd(limit) : -rnd(limit);  // random sign and magnitude; only the size x size region is written

+      }

+    }

+    const uint64_t res_ref = ref_func_(src, stride, size);

+    uint64_t res_tst;

+    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));  // the call under test is made inside the macro

+    ASSERT_EQ(res_ref, res_tst)

+        << "Error: Sum Squares Test"

+        << " C output does not match optimized output.";

+  }

+}

+TEST_P(SumSquaresTest, ExtremeValues) {  // compares tst_func_ with ref_func_ on blocks filled with +/-(limit - 1)

+  ACMRandom rnd(ACMRandom::DeterministicSeed());

+  DECLARE_ALIGNED(16, int16_t, src[256 * 256]);  // room for 128 rows at the largest stride (256) generated below

+  const int msb = 11;  // Up to 12 bit input

+  const int limit = 1 << (msb + 1);  // 4096, so the fill value below is +/-4095

+  for (int k = 0; k < kNumIterations; k++) {

+    const int size = 4 << rnd(6);    // Up to 128x128

+    int stride = 4 << rnd(7);  // Up to 256 stride

+    while (stride < size) {    // Make sure it's valid

+      stride = 4 << rnd(7);

+    }

+    const int val = rnd(2) ? limit - 1 : -(limit - 1);  // one sign is picked per iteration and used for the whole block

+    for (int i = 0; i < size; ++i) {

+      for (int j = 0; j < size; ++j) {

+        src[i * stride + j] = val;  // only the size x size region is written

+      }

+    }

+    const uint64_t res_ref = ref_func_(src, stride, size);

+    uint64_t res_tst;

+    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));  // the call under test is made inside the macro

+    ASSERT_EQ(res_ref, res_tst)

+        << "Error: Sum Squares Test"

+        << " C output does not match optimized output.";

+  }

+}

+using std::tr1::make_tuple;  // builds the SumSquaresParam tuples passed to the instantiation below

+#if HAVE_SSE2  // the SSE2 function is only referenced when the build defines HAVE_SSE2 non-zero

+INSTANTIATE_TEST_CASE_P(

+    SSE2, SumSquaresTest,

+    ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,  // element 0: becomes ref_func_

+                                 &vpx_sum_squares_2d_i16_sse2)));  // element 1: becomes tst_func_

+#endif  // HAVE_SSE2

+}  // namespace

--- a/test/test.mk

+++ b/test/test.mk

@@ -170,6 +170,7 @@

 ## Multi-codec / unconditional whitebox tests.

 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc

+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc

 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc

 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -64,6 +64,7 @@

 } REF_DEFINITION;

 struct rdcost_block_args {

+  const VP9_COMP *cpi;

   MACROBLOCK *x;

   ENTROPY_CONTEXT t_above[16];

   ENTROPY_CONTEXT t_left[16];

@@ -463,38 +464,123 @@

   return cost;

-static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,

+static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, int block,

+                       int blk_row, int blk_col, TX_SIZE tx_size,

                        int64_t *out_dist, int64_t *out_sse) {

-  const int ss_txfrm_size = tx_size << 1;

   MACROBLOCKD* const xd = &x->e_mbd;

   const struct macroblock_plane *const p = &x->plane[plane];

   const struct macroblockd_plane *const pd = &xd->plane[plane];

-  int64_t this_sse;

-  int shift = tx_size == TX_32X32 ? 0 : 2;

-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+  if (cpi->sf.txfm_domain_distortion) {

+    const int ss_txfrm_size = tx_size << 1;

+    int64_t this_sse;

+    const int shift = tx_size == TX_32X32 ? 0 : 2;

+    const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);

+    const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

 #if CONFIG_VP9_HIGHBITDEPTH

-  const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;

-  *out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,

-                                              16 << ss_txfrm_size,

-                                              &this_sse, bd) >> shift;

+    const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;

+    *out_dist = vp9_highbd_block_error_dispatch(

+                    coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >>

+                shift;

 #else

-  *out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,

-                              &this_sse) >> shift;

+    *out_dist =

+        vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >>

+        shift;

 #endif  // CONFIG_VP9_HIGHBITDEPTH

-  *out_sse = this_sse >> shift;

+    *out_sse = this_sse >> shift;

-  if (x->skip_encode && !is_inter_block(xd->mi[0])) {

-    // TODO(jingning): tune the model to better capture the distortion.

-    int64_t p = (pd->dequant[1] * pd->dequant[1] *

-                    (1 << ss_txfrm_size)) >>

+    if (x->skip_encode && !is_inter_block(xd->mi[0])) {

+      // TODO(jingning): tune the model to better capture the distortion.

+      const int64_t p =

+          (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>

 #if CONFIG_VP9_HIGHBITDEPTH

-                        (shift + 2 + (bd - 8) * 2);

+                  (shift + 2 + (bd - 8) * 2);

 #else

-                        (shift + 2);

+                  (shift + 2);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

-    *out_dist += (p >> 4);

-    *out_sse  += p;

+      *out_dist += (p >> 4);

+      *out_sse += p;

+    }

+  } else {

+    const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];

+    const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize];

+    const int src_stride = p->src.stride;

+    const int dst_stride = pd->dst.stride;

+    const int src_idx = 4 * (blk_row * src_stride + blk_col);

+    const int dst_idx = 4 * (blk_row * dst_stride + blk_col);

+    const uint8_t *src = &p->src.buf[src_idx];

+    const uint8_t *dst = &pd->dst.buf[dst_idx];

+    const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);

+    const uint16_t *eob = &p->eobs[block];

+    unsigned int tmp;

+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);

+    *out_sse = (int64_t)tmp * 16;

+    if (*eob) {

+#if CONFIG_VP9_HIGHBITDEPTH

+      DECLARE_ALIGNED(16, uint16_t, recon16[1024]);

+      uint8_t *recon = (uint8_t *)recon16;

+#else

+      DECLARE_ALIGNED(16, uint8_t, recon[1024]);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+#if CONFIG_VP9_HIGHBITDEPTH

+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

+        recon = CONVERT_TO_BYTEPTR(recon);

+        vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0,

+                                 bs, bs, xd->bd);

+        if (xd->lossless) {

+          vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);

+        } else {

+          switch (tx_size) {

+            case TX_4X4:

+              vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);

+              break;

+            case TX_8X8:

+              vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);

+              break;

+            case TX_16X16:

+              vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);

+              break;

+            case TX_32X32:

+              vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);

+              break;

+            default:

+              assert(0 && "Invalid transform size");

+          }

+        }

+      } else {

+#endif   // CONFIG_VP9_HIGHBITDEPTH

+        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);

+        switch (tx_size) {

+          case TX_32X32:

+            vp9_idct32x32_add(dqcoeff, recon, 32, *eob);

+            break;

+          case TX_16X16:

+            vp9_idct16x16_add(dqcoeff, recon, 32, *eob);

+            break;

+          case TX_8X8:

+            vp9_idct8x8_add(dqcoeff, recon, 32, *eob);

+            break;

+          case TX_4X4:

+            // this is like vp9_short_idct4x4 but has a special case around

+            // eob<=1, which is significant (not just an optimization) for

+            // the lossless case.

+            x->itxm_add(dqcoeff, recon, 32, *eob);

+            break;

+          default:

+            assert(0 && "Invalid transform size");

+            break;

+        }

+#if CONFIG_VP9_HIGHBITDEPTH

+      }

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+      cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, 32, &tmp);

+    }

+    *out_dist = (int64_t)tmp * 16;

@@ -506,9 +592,8 @@

                      args->use_fast_coef_costing);

-static void block_rd_txfm(int plane, int block, int row, int col,

-                          BLOCK_SIZE plane_bsize,

-                          TX_SIZE tx_size, void *arg) {

+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,

+                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {

   struct rdcost_block_args *args = arg;

   MACROBLOCK *const x = args->x;

   MACROBLOCKD *const xd = &x->e_mbd;

@@ -523,20 +608,47 @@

   if (!is_inter_block(mi)) {

     struct encode_b_args arg = {x, NULL, &mi->skip};

-    vp9_encode_block_intra(plane, block, row, col, plane_bsize, tx_size, &arg);

-    dist_block(x, plane, block, tx_size, &dist, &sse);

+    vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,

+                           &arg);

+    if (args->cpi->sf.txfm_domain_distortion) {

+      dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,

+                 &sse);

+    } else {

+      const int bs = 4 << tx_size;

+      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];

+      const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;

+      const struct macroblock_plane *const p = &x->plane[plane];

+      const struct macroblockd_plane *const pd = &xd->plane[plane];

+      const int src_stride = p->src.stride;

+      const int dst_stride = pd->dst.stride;

+      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];

+      const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];

+      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];

+      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];

+      unsigned int tmp;

+      sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);

+#if CONFIG_VP9_HIGHBITDEPTH

+      if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))

+        sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+      sse = sse * 16;

+      variance(src, src_stride, dst, dst_stride, &tmp);

+      dist = (int64_t)tmp * 16;

+    }

   } else if (max_txsize_lookup[plane_bsize] == tx_size) {

     if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==

         SKIP_TXFM_NONE) {

       // full forward transform and quantization

-      vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);

-      dist_block(x, plane, block, tx_size, &dist, &sse);

+      vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);

+      dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,

+                 &sse);

     } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==

                SKIP_TXFM_AC_ONLY) {

       // compute DC coefficient

       tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);

       tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);

-      vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size);

+      vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize,

+                         tx_size);

       sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;

       dist = sse;

       if (x->plane[plane].eobs[block]) {

@@ -560,8 +672,9 @@

   } else {

     // full forward transform and quantization

-    vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);

-    dist_block(x, plane, block, tx_size, &dist, &sse);

+    vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);

+    dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,

+               &sse);

   rd = RDCOST(x->rdmult, x->rddiv, 0, dist);

@@ -570,7 +683,7 @@

     return;

-  rate = rate_block(plane, block, row, col, tx_size, args);

+  rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);

   rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);

   rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);

@@ -593,16 +706,15 @@

   args->skippable &= !x->plane[plane].eobs[block];

-static void txfm_rd_in_plane(MACROBLOCK *x,

-                             int *rate, int64_t *distortion,

-                             int *skippable, int64_t *sse,

-                             int64_t ref_best_rd, int plane,

-                             BLOCK_SIZE bsize, TX_SIZE tx_size,

-                             int use_fast_coef_casting) {

+static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,

+                             int64_t *distortion, int *skippable, int64_t *sse,

+                             int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,

+                             TX_SIZE tx_size, int use_fast_coef_casting) {

   MACROBLOCKD *const xd = &x->e_mbd;

   const struct macroblockd_plane *const pd = &xd->plane[plane];

   struct rdcost_block_args args;

   vp9_zero(args);

+  args.cpi = cpi;

   args.x = x;

   args.best_rd = ref_best_rd;

   args.use_fast_coef_costing = use_fast_coef_casting;

@@ -643,8 +755,7 @@

   mi->tx_size = VPXMIN(max_tx_size, largest_tx_size);

-  txfm_rd_in_plane(x, rate, distortion, skip,

-                   sse, ref_best_rd, 0, bs,

+  txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,

                    mi->tx_size, cpi->sf.use_fast_coef_costing);

@@ -695,9 +806,8 @@

       else

         r_tx_size += vp9_cost_one(tx_probs[m]);

-    txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],

-                     &sse[n], ref_best_rd, 0, bs, n,

-                     cpi->sf.use_fast_coef_costing);

+    txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0,

+                     bs, n, cpi->sf.use_fast_coef_costing);

     r[n][1] = r[n][0];

     if (r[n][0] < INT_MAX) {

       r[n][1] += r_tx_size;

@@ -1172,9 +1282,8 @@

   *skippable = 1;

   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {

-    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,

-                     ref_best_rd, plane, bsize, uv_tx_size,

-                     cpi->sf.use_fast_coef_costing);

+    txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,

+                     plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing);

     if (pnrate == INT_MAX) {

       is_cost_valid = 0;

       break;

--- a/vp9/encoder/vp9_speed_features.c

+++ b/vp9/encoder/vp9_speed_features.c

@@ -162,6 +162,7 @@

     sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;

     sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;

     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;

+    sf->txfm_domain_distortion = 1;

   if (speed >= 2) {

@@ -279,6 +280,7 @@

   sf->exhaustive_searches_thresh = INT_MAX;

   if (speed >= 1) {

+    sf->txfm_domain_distortion = 1;

     sf->use_square_partition_only = !frame_is_intra_only(cm);

     sf->less_rectangular_check = 1;

     sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD

@@ -541,6 +543,7 @@

   sf->disable_filter_search_var_thresh = 0;

   sf->adaptive_interp_filter_search = 0;

   sf->allow_partition_search_skip = 0;

+  sf->txfm_domain_distortion = 0;

   for (i = 0; i < TX_SIZES; i++) {

     sf->intra_y_mode_mask[i] = INTRA_ALL;

--- a/vp9/encoder/vp9_speed_features.h

+++ b/vp9/encoder/vp9_speed_features.h

@@ -246,6 +246,11 @@

   // Coefficient probability model approximation step size

   int coeff_prob_appx_step;

+  // When non-zero, compute distortion in the transform domain. When zero,

+  // compute it in the pixel domain instead; the pixel domain computation

+  // improves the distortion metric precision.

+  int txfm_domain_distortion;

   // The threshold is to determine how slow the motino is, it is used when

   // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION

   MOTION_THRESHOLD lf_motion_threshold;

--- /dev/null

+++ b/vpx_dsp/sum_squares.c

@@ -1,0 +1,27 @@

+/*

+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vpx_dsp_rtcd.h"

+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,

+                                  int size) {  // returns the sum of v * v over a size x size block of int16 values

+  int r, c;

+  uint64_t ss = 0;  // running total, returned to the caller

+  for (r = 0; r < size; r++) {

+    for (c = 0; c < size; c++) {

+      const int16_t v = src[c];

+      ss += v * v;  // the product is formed in int (integer promotion) before being added to the 64-bit total

+    }

+    src += src_stride;  // step to the next row; src_stride is counted in int16_t elements

+  }

+  return ss;

+}

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -264,6 +264,8 @@

 ifeq ($(CONFIG_ENCODERS),yes)

 DSP_SRCS-yes            += sad.c

 DSP_SRCS-yes            += subtract.c

+DSP_SRCS-yes            += sum_squares.c

+DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c

 DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM)

 DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -1146,6 +1146,9 @@

 add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";

 specialize qw/vpx_sad4x4x4d msa sse2/;

+add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";

+specialize qw/vpx_sum_squares_2d_i16 sse2/;

 # Structured Similarity (SSIM)

--- /dev/null

+++ b/vpx_dsp/x86/sum_squares_sse2.c

@@ -1,0 +1,128 @@

+/*

+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <assert.h>

+#include <emmintrin.h>

+#include <stdio.h>

+#include "./vpx_dsp_rtcd.h"

+static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,

+                                                int stride) {  // sum of squares of a 4x4 int16 block; stride is in int16_t elements

+  const __m128i v_val_0_w =

+      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));  // 64-bit load: one row of four int16 values, upper half zeroed

+  const __m128i v_val_1_w =

+      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));

+  const __m128i v_val_2_w =

+      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));

+  const __m128i v_val_3_w =

+      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));

+  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);  // squares each int16 and adds adjacent pairs into 32-bit lanes

+  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);

+  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);

+  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);

+  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);  // combine the four rows lane by lane

+  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);

+  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);

+  const __m128i v_sum_d =

+      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));  // add 32-bit lane 1 onto lane 0

+  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);  // low 32 bits only; NOTE(review): the int is sign-extended by the cast, so the total is assumed to stay below 2^31

+}

+// TODO(jingning): Evaluate the performance impact here.

+#ifdef __GNUC__

+// This prevents GCC/Clang from inlining this function into

+// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack

+// maintenance instructions in the common case of 4x4.

+__attribute__((noinline))

+#endif

+static uint64_t

+vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {  // sum of squares over size x size, walked in 8x8 tiles

+  int r, c;

+  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);  // keeps the low 32 bits of each 64-bit lane

+  __m128i v_acc_q = _mm_setzero_si128();  // two 64-bit running totals

+  for (r = 0; r < size; r += 8) {  // one band of 8 rows per pass; NOTE(review): assumes size is a multiple of 8

+    __m128i v_acc_d = _mm_setzero_si128();  // 32-bit partial sums for this band; NOTE(review): widened as unsigned below, so each lane must stay below 2^32

+    for (c = 0; c < size; c += 8) {  // one tile of 8 columns per pass

+      const int16_t *b = src + c;  // top-left sample of the current 8x8 tile

+      const __m128i v_val_0_w =

+          _mm_load_si128((const __m128i *)(b + 0 * stride));  // aligned 128-bit load of one tile row; NOTE(review): requires 16-byte aligned row addresses - confirm with callers

+      const __m128i v_val_1_w =

+          _mm_load_si128((const __m128i *)(b + 1 * stride));

+      const __m128i v_val_2_w =

+          _mm_load_si128((const __m128i *)(b + 2 * stride));

+      const __m128i v_val_3_w =

+          _mm_load_si128((const __m128i *)(b + 3 * stride));

+      const __m128i v_val_4_w =

+          _mm_load_si128((const __m128i *)(b + 4 * stride));

+      const __m128i v_val_5_w =

+          _mm_load_si128((const __m128i *)(b + 5 * stride));

+      const __m128i v_val_6_w =

+          _mm_load_si128((const __m128i *)(b + 6 * stride));

+      const __m128i v_val_7_w =

+          _mm_load_si128((const __m128i *)(b + 7 * stride));

+      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);  // squares each int16 and adds adjacent pairs into four 32-bit lanes

+      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);

+      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);

+      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);

+      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);

+      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);

+      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);

+      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

+      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);  // pairwise tree of 32-bit adds over the eight rows

+      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);

+      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);

+      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

+      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);

+      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

+      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);  // fold the tile into the band's 32-bit partial sums

+      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);

+    }

+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));  // even 32-bit lanes, zero-extended to 64 bits

+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));  // odd 32-bit lanes, moved down by a logical shift

+    src += 8 * stride;  // advance to the next band of 8 rows

+  }

+  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));  // add the upper 64-bit total onto the lower one

+#if ARCH_X86_64

+  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);  // read the low 64 bits directly

+#else

+  {

+    uint64_t tmp;

+    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);  // store the low 64 bits through memory when ARCH_X86_64 is not set

+    return tmp;

+  }

+#endif

+}

+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {  // SSE2 entry point: dispatches on size to one of the two static kernels

+  // 4 elements per row only requires half an XMM register, so this

+  // must be a special case, but also note that over 75% of all calls

+  // are with size == 4, so it is also the common case.

+  if (size == 4) {

+    return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);

+  } else {

+    // Generic case

+    return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);  // NOTE(review): that kernel steps rows and columns by 8, so size is presumably a multiple of 8 here

+  }

+}