ref: af660715c01fc5403700c49686885b2f0ee6133b
parent: 1b5421f3c5a8cf76db2fc8b939be706ad63530e7
author: Ronald S. Bultje <[email protected]>
date: Thu Jun 27 13:41:54 EDT 2013
Make coefficient skip condition an explicit RD choice.

This commit replaces zrun_zbin_boost, a method of biasing non-zero
coefficients following runs of zero coefficients to be rounded towards
zero, with an explicit skip-block choice in the RD loop.

The logic is basically that if individual coefficients should be
rounded towards zero (from an RD point of view), the trellis/optimize
loop should take care of it. If whole blocks should be zero (from an
RD point of view), a single RD check is much more efficient than a
complete serialization of the quantization loop.

Quality change: derf +0.5% psnr, +1.6% ssim; yt +0.6% psnr, +1.1% ssim.
SIMD for quantize will follow in a separate patch; results for other
test sets are pending.

Change-Id: Ife5fa641163ac5150ac428011e87188f1937c1f4
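For context, the new decision point in vp9_rdopt.c is a single RD
comparison: either code the quantized coefficients (rate_y + rate_uv
against distortion2) or zero the whole block and take total_sse as the
distortion, where total_sse comes from the new ssz output of
vp9_block_error, i.e. the sum of squared source coefficients. Below is
a minimal, self-contained C sketch of that comparison; RDCOST is
modelled on the macro in vp9_rdopt.h, and rd_should_skip plus the
numbers in main are illustrative, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Modelled on RDCOST in vp9_rdopt.h: rate weighted by rdmult
     * (with fixed-point rounding), distortion scaled up by rddiv. */
    #define RDCOST(RM, DM, R, D) \
      (((128 + (int64_t)(R) * (RM)) >> 8) + ((int64_t)(D) << (DM)))

    /* Returns 1 if zeroing the whole block is the better RD choice,
     * i.e. the analogue of the new branch in vp9_rdopt.c that sets
     * this_skip2 = 1. */
    static int rd_should_skip(int rdmult, int rddiv, int coef_rate,
                              int64_t coef_dist, int64_t sse) {
      return RDCOST(rdmult, rddiv, coef_rate, coef_dist) >=
             RDCOST(rdmult, rddiv, 0, sse);
    }

    int main(void) {
      const int rdmult = 70, rddiv = 2; /* illustrative lambda */
      const int64_t sse = 4000;       /* distortion, all coeffs zeroed */
      const int coef_rate = 2000;     /* rate to code the coefficients */
      const int64_t coef_dist = 3950; /* distortion after coding them  */
      /* Coding buys almost no distortion for much rate, so skip wins. */
      printf("skip block: %d\n",
             rd_should_skip(rdmult, rddiv, coef_rate, coef_dist, sse));
      return 0;
    }

In the actual patch the chosen branch then also pays for signalling the
mb_skip flag (vp9_cost_bit on PRED_MBSKIP as 0 or 1), and on skip the
coefficient rates rate_y/rate_uv are removed from rate2; the sketch
omits that bookkeeping.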
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -558,7 +558,7 @@
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
specialize vp9_block_error sse2
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -68,7 +68,6 @@
int16_t *quant;
uint8_t *quant_shift;
int16_t *zbin;
- int16_t *zrun_zbin_boost;
int16_t *round;
// Zbin Over Quant value
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -268,11 +268,7 @@
DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
-
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
#endif
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
- DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
MACROBLOCK mb;
VP9_COMMON common;
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,8 +21,7 @@
extern int enc_debug;
#endif
-static void quantize(int16_t *zbin_boost_orig_ptr,
- int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize(int16_t *coeff_ptr, int n_coeffs, int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
uint8_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -31,8 +30,6 @@
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
int zero_flag = n_coeffs;
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -65,8 +62,7 @@
rc = scan[i];
z = coeff_ptr[rc];
- zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
- zero_run += (zero_run < 15);
+ zbin = (zbins[rc != 0]);
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz;
@@ -81,7 +77,6 @@
if (y) {
eob = i; // last nonzero coeffs
- zero_run = 0; // set zero_run
}
}
}
@@ -90,8 +85,7 @@
}
// This function works well for large transform size.
-static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
- int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize_sparse(int16_t *coeff_ptr, int n_coeffs, int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
int16_t *quant_ptr, uint8_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -101,10 +95,7 @@
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
- int zero_run = 0;
- int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
int idx = 0;
- int pre_idx = 0;
vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -135,11 +126,8 @@
rc = scan[idx_arr[i]];
// Calculate ZBIN
- zero_run += idx_arr[i] - pre_idx;
- if(zero_run > 15) zero_run = 15;
- zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+ zbin = (zbins[rc != 0]);
- pre_idx = idx_arr[i];
z = coeff_ptr[rc] * 2;
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
@@ -155,7 +143,6 @@
if (y) {
eob = idx_arr[i]; // last nonzero coeffs
- zero_run = -1; // set zero_run
}
}
}
@@ -189,8 +176,7 @@
// Save index of picked coefficient in pre-scan pass.
int idx_arr[1024];
- quantize_sparse(mb->plane[plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
@@ -204,8 +190,7 @@
scan, idx_arr);
}
else {
- quantize(mb->plane[plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ quantize(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
n_coeffs, mb->skip_block,
mb->plane[plane].zbin,
mb->plane[plane].round,
@@ -226,8 +211,7 @@
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
const int *pt_scan = get_scan_4x4(tx_type);
- quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+ quantize(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
16, mb->skip_block,
mb->plane[pb_idx.plane].zbin,
mb->plane[pb_idx.plane].round,
@@ -261,9 +245,6 @@
#endif
int q;
- static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12,
- 14, 16, 20, 24, 28, 32, 36, 40 };
-
for (q = 0; q < QINDEX_RANGE; q++) {
int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
int qrounding_factor = 48;
@@ -277,7 +258,6 @@
cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.y_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
@@ -284,7 +264,6 @@
cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.uv_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
#if CONFIG_ALPHA
quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
@@ -292,7 +271,6 @@
cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
cpi->common.a_dequant[q][0] = quant_val;
- cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
#endif
quant_val = vp9_ac_quant(q, 0);
@@ -310,15 +288,11 @@
invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
- cpi->zrun_zbin_boost_y[q][i] =
- ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
quant_uv_val);
cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
- cpi->zrun_zbin_boost_uv[q][i] =
- ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
#if CONFIG_ALPHA
invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
@@ -326,8 +300,6 @@
cpi->a_zbin[q][rc] =
ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
- cpi->zrun_zbin_boost_a[q][i] =
- ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
#endif
}
}
@@ -348,7 +320,6 @@
x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
x->plane[0].zbin = cpi->y_zbin[qindex];
x->plane[0].round = cpi->y_round[qindex];
- x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
x->plane[0].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
@@ -361,7 +332,6 @@
x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
x->plane[i].zbin = cpi->uv_zbin[qindex];
x->plane[i].round = cpi->uv_round[qindex];
- x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
x->plane[i].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
}
@@ -371,7 +341,6 @@
x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
x->plane[3].zbin = cpi->a_zbin[qindex];
x->plane[3].round = cpi->a_round[qindex];
- x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
x->plane[3].zbin_extra = (int16_t)zbin_extra;
x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
#endif
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -283,15 +283,17 @@
}
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
- intptr_t block_size) {
+ intptr_t block_size, int64_t *ssz) {
int i;
- int64_t error = 0;
+ int64_t error = 0, sqcoeff = 0;
for (i = 0; i < block_size; i++) {
int this_diff = coeff[i] - dqcoeff[i];
error += (unsigned)this_diff * this_diff;
+ sqcoeff += (unsigned) coeff[i] * coeff[i];
}
+ *ssz = sqcoeff;
return error;
}
@@ -501,27 +503,31 @@
}
static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift) {
+ int shift, int64_t *sse) {
struct macroblockd_plane *p = &x->e_mbd.plane[0];
const int bw = plane_block_width(bsize, p);
const int bh = plane_block_height(bsize, p);
- return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
- bw * bh) >> shift;
+ int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+ bw * bh, sse) >> shift;
+ *sse >>= shift;
+ return e;
}
static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
- int shift) {
- int64_t sum = 0;
+ int shift, int64_t *sse) {
+ int64_t sum = 0, this_sse;
int plane;
+ *sse = 0;
for (plane = 1; plane < MAX_MB_PLANE; plane++) {
struct macroblockd_plane *p = &x->e_mbd.plane[plane];
const int bw = plane_block_width(bsize, p);
const int bh = plane_block_height(bsize, p);
sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
- bw * bh);
+ bw * bh, &this_sse);
+ *sse += this_sse;
}
-
+ *sse >>= shift;
return sum >> shift;
}
@@ -581,7 +587,7 @@
static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion,
- int *skippable,
+ int *skippable, int64_t *sse,
BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
xd->mode_info_context->mbmi.txfm_size = tx_size;
@@ -591,7 +597,7 @@
else
vp9_xform_quant_sby(cm, x, bsize);
- *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+ *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2, sse);
*rate = rdcost_plane(cm, x, 0, bsize, tx_size);
*skippable = vp9_sby_is_skippable(xd, bsize);
}
@@ -598,11 +604,11 @@
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int64_t *distortion,
- int *skip, BLOCK_SIZE_TYPE bs,
+ int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
int64_t txfm_cache[NB_TXFM_MODES]) {
VP9_COMMON *const cm = &cpi->common;
int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
- int64_t d[TX_SIZE_MAX_SB];
+ int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -621,25 +627,27 @@
mbmi->txfm_size = TX_4X4;
}
vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
- super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+ super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,
mbmi->txfm_size);
return;
}
if (bs >= BLOCK_SIZE_SB32X32)
super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- bs, TX_32X32);
+ &sse[TX_32X32], bs, TX_32X32);
if (bs >= BLOCK_SIZE_MB16X16)
super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- bs, TX_16X16);
- super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
- TX_8X8);
- super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
- TX_4X4);
+ &sse[TX_16X16], bs, TX_16X16);
+ super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ &sse[TX_8X8], bs, TX_8X8);
+ super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ &sse[TX_4X4], bs, TX_4X4);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache,
TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
- (bs < BLOCK_SIZE_MB16X16));
+ if (psse)
+ *psse = sse[mbmi->txfm_size];
}
static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -688,6 +696,8 @@
for (idy = 0; idy < bh; ++idy) {
for (idx = 0; idx < bw; ++idx) {
+ int64_t ssz;
+
block = ib + idy * 2 + idx;
xd->mode_info_context->bmi[block].as_mode.first = mode;
src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
@@ -718,7 +728,8 @@
ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
tempa + idx, templ + idy, TX_4X4, 16);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
- block, 16), 16) >> 2;
+ block, 16),
+ 16, &ssz) >> 2;
if (best_tx_type != DCT_DCT)
vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
@@ -881,7 +892,7 @@
}
x->e_mbd.mode_info_context->mbmi.mode = mode;
- super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
bsize, local_txfm_cache);
this_rate = this_rate_tokenonly + bmode_costs[mode];
@@ -914,15 +925,18 @@
static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion,
- int *skippable, BLOCK_SIZE_TYPE bsize,
+ int *skippable, int64_t *sse,
+ BLOCK_SIZE_TYPE bsize,
TX_SIZE uv_tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t dummy;
if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
vp9_encode_intra_block_uv(cm, x, bsize);
else
vp9_xform_quant_sbuv(cm, x, bsize);
- *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+ *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+ sse ? sse : &dummy);
*rate = rdcost_uv(cm, x, bsize, uv_tx_size);
*skippable = vp9_sbuv_is_skippable(xd, bsize);
}
@@ -929,7 +943,7 @@
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ int64_t *sse, BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
@@ -937,7 +951,7 @@
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sbuv(x, bsize);
- super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+ super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
uv_txfm_size);
}
@@ -954,7 +968,7 @@
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
- &this_distortion, &s, bsize);
+ &this_distortion, &s, NULL, bsize);
this_rate = this_rate_tokenonly +
x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -1151,6 +1165,8 @@
k = i;
for (idy = 0; idy < bh / 4; ++idy) {
for (idx = 0; idx < bw / 4; ++idx) {
+ int64_t ssz;
+
k += (idy * 2 + idx);
src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
x->plane[0].src_diff);
@@ -1159,7 +1175,7 @@
x->quantize_b_4x4(x, k, DCT_DCT, 16);
thisdistortion += vp9_block_error(coeff,
BLOCK_OFFSET(xd->plane[0].dqcoeff,
- k, 16), 16);
+ k, 16), 16, &ssz);
thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
ta + (k & 1),
tl + (k >> 1), TX_4X4, 16);
@@ -2238,7 +2254,8 @@
INTERPOLATIONFILTERTYPE *best_filter,
int_mv *frame_mv,
int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES]) {
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int64_t *psse) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -2467,9 +2484,10 @@
if (!x->skip) {
int skippable_y, skippable_uv;
+ int64_t sseuv = INT_MAX;
// Y cost and distortion
- super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+ super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
bsize, txfm_cache);
*rate2 += *rate_y;
@@ -2476,8 +2494,9 @@
*distortion += *distortion_y;
super_block_uvrd(cm, x, rate_uv, distortion_uv,
- &skippable_uv, bsize);
+ &skippable_uv, &sseuv, bsize);
+ *psse += sseuv;
*rate2 += *rate_uv;
*distortion += *distortion_uv;
*skippable = skippable_y && skippable_uv;
@@ -2611,6 +2630,7 @@
int bws = (1 << bwsl) / 4; // mode_info step for subsize
int bhsl = b_height_log2(bsize);
int bhs = (1 << bhsl) / 4; // mode_info step for subsize
+ int best_skip2 = 0;
for (i = 0; i < 4; i++) {
int j;
@@ -2702,6 +2722,8 @@
int skippable;
int64_t txfm_cache[NB_TXFM_MODES];
int i;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
for (i = 0; i < NB_TXFM_MODES; ++i)
txfm_cache[i] = INT64_MAX;
@@ -2863,7 +2885,7 @@
txfm_cache[i] = txfm_cache[ONLY_4X4];
} else if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
- super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
bsize, txfm_cache);
uv_tx = mbmi->txfm_size;
@@ -2989,7 +3011,7 @@
BLOCK_SIZE_SB8X8);
vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
- &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+ &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
rate2 += rate_uv;
distortion2 += distortion_uv;
skippable = skippable && uv_skippable;
@@ -3017,7 +3039,7 @@
&mode_excluded, &disable_skip,
&tmp_best_filter, frame_mv[this_mode],
mi_row, mi_col,
- single_newmv);
+ single_newmv, &total_sse);
if (this_rd == INT64_MAX)
continue;
}
@@ -3062,10 +3084,29 @@
rate2 += prob_skip_cost;
}
}
+ } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+ this_mode != SPLITMV) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+ // Add in the cost of the no skip flag.
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+ PRED_MBSKIP), 0);
+ rate2 += prob_skip_cost;
+ } else {
+ int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+ PRED_MBSKIP), 1);
+ rate2 += prob_skip_cost;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
} else if (mb_skip_allowed) {
// Add in the cost of the no skip flag.
int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
- PRED_MBSKIP), 0);
+ PRED_MBSKIP), 0);
rate2 += prob_skip_cost;
}
@@ -3119,6 +3160,7 @@
*returndistortion = distortion2;
best_rd = this_rd;
best_mbmode = *mbmi;
+ best_skip2 = this_skip2;
best_partition = *x->partition_info;
if (this_mode == I4X4_PRED || this_mode == SPLITMV)
@@ -3301,6 +3343,7 @@
// macroblock modes
*mbmi = best_mbmode;
+ x->skip |= best_skip2;
if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
for (i = 0; i < 4; i++)
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -12,20 +12,22 @@
SECTION .text
-; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
INIT_XMM sse2
-cglobal block_error, 3, 3, 6, uqc, dqc, size
- pxor m4, m4 ; accumulator
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
pxor m5, m5 ; dedicated zero register
lea uqcq, [uqcq+sizeq*2]
lea dqcq, [dqcq+sizeq*2]
neg sizeq
.loop:
- mova m0, [uqcq+sizeq*2]
- mova m2, [dqcq+sizeq*2]
- mova m1, [uqcq+sizeq*2+mmsize]
- mova m3, [dqcq+sizeq*2+mmsize]
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
@@ -32,25 +34,40 @@
; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
pmaddwd m0, m0
pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
; accumulate in 64bit
- punpckldq m2, m0, m5
+ punpckldq m7, m0, m5
punpckhdq m0, m5
- punpckldq m3, m1, m5
- punpckhdq m1, m5
- paddq m4, m2
+ paddq m4, m7
+ punpckldq m7, m1, m5
paddq m4, m0
- paddq m4, m3
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
add sizeq, mmsize
jl .loop
; accumulate horizontally and store in return value
movhlps m5, m4
+ movhlps m7, m6
paddq m4, m5
+ paddq m6, m7
%if ARCH_X86_64
movq rax, m4
+ movq [sszq], m6
%else
+ mov eax, sszm
pshufd m5, m4, 0x1
+ movq [eax], m6
movd eax, m4
movd edx, m5
%endif