shithub: libvpx

Download patch

ref: ced21bd6a6c8427a69666f3c01ab2966ce845f32
parent: 1badebc8215bbb91e5445b91ccde57fddd369fbb
author: Jim Bankoski <[email protected]>
date: Thu May 30 11:13:08 EDT 2013

Creates a new speed 1:

This speed 1 uses a variance threshold, borrowed from static-thresh,
to determine splits.  Any superblock whose variance is greater than
static-thresh * quantizer index squared is split.  In addition, the
transform size is set to the largest size less than or equal to the
partition size, the sub-pixel filter is set to normal, and only 12
modes are used at all.

Change-Id: If7a2858ee70f96d1eb989c04fd87a332b147abef

--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -208,7 +208,6 @@
 
 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
-
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
@@ -264,6 +263,10 @@
 
 prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x8 mmx sse2
+
+prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"
+specialize vp9_get_sse_sum_8x8 sse2
+vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2
 
 prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x4 sse2
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -10,6 +10,7 @@
 
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -97,7 +98,9 @@
   return vp9_encode_intra(cpi, x, use_dc_pred);
 }
 
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
 
+
 // Measure the activity of the current macroblock
 // What we measure here is TBD so abstracted to this function
 #define ALT_ACT_MEASURE 1
@@ -765,7 +768,36 @@
   vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
              sizeof(PARTITION_CONTEXT) * mh);
 }
+static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                          PARTITION_CONTEXT sa[8],
+                          PARTITION_CONTEXT sl[8],
+                          BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;  // above entropy ctx count
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;  // left entropy ctx count
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;  // above partition ctx count
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;  // left partition ctx count
 
+  // Buffer the above/left entropy contexts of the block in search, per plane.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bw * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bh * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,  // then the partition ctxs
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * mh);
+}
+
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
                      int mi_row, int mi_col, int output_enabled,
                      BLOCK_SIZE_TYPE bsize, int sub_index) {
@@ -857,7 +889,338 @@
   }
 }
 
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 2;  // step per block; presumably MODE_INFO units
+  int block_row, block_col;
+  int row, col;
 
+  // Test helper: sets sb_type to bsize for the 8x8 MODE_INFO entries at m.
+  for (block_row = 0; block_row < 8; block_row += bs) {
+    for (block_col = 0; block_col < 8; block_col += bs) {
+      for (row = 0; row < bs; row++) {
+        for (col = 0; col < bs; col++) {
+          m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
+        }
+      }
+    }
+  }
+}
+
+static void set_block_size(VP9_COMMON *const cm,
+                           MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
+                           int mi_row, int mi_col) {
+  int row, col;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 2;  // side of the stamped square, in MODE_INFO entries
+  MODE_INFO *m2 = m + mi_row * mis + mi_col;
+  for (row = 0; row < bs; row++) {
+    for (col = 0; col < bs; col++) {
+      if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
+        continue;  // skip entries outside the frame but keep filling the rest
+      m2[row*mis+col].mbmi.sb_type = bsize;
+    }
+  }
+}
+typedef struct {
+  int64_t sum_square_error;  // accumulated squared error
+  int64_t sum_error;         // accumulated error
+  int count;                 // number of samples accumulated
+  int variance;              // 256 * (sse - sum * sum / count) / count
+} var;
+
+#define VT(TYPE, BLOCKSIZE) \
+  typedef struct { \
+    var none;           /* the whole block */ \
+    var horz[2];        /* top and bottom halves */ \
+    var vert[2];        /* left and right halves */ \
+    BLOCKSIZE split[4]; /* four quadrants, raster order */ } TYPE;
+
+VT(v8x8, var)       // 8x8 node; only .none is filled in the code visible here
+VT(v16x16, v8x8)    // 16x16 node with four 8x8 children
+VT(v32x32, v16x16)  // 32x32 node with four 16x16 children
+VT(v64x64, v32x32)  // 64x64 root with four 32x32 children
+
+typedef enum {
+  V16X16,  // variance tree level names; not referenced in the code shown here
+  V32X32,
+  V64X64,
+} TREE_LEVEL;
+
+// Store the sums in *v and derive variance = 256 * (s2 - s * s / c) / c.
+static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->count = c;  // used as a divisor below, so c must be non-zero
+  v->variance = 256
+      * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+      / v->count;  // int64_t arithmetic, narrowed to int on assignment
+}
+
+// Fills the four 8x8 leaves (split[k].none) of a 16x16 node from s versus d.
+static void fill_16x16_variance(const unsigned char *s, int sp,
+                                const unsigned char *d, int dp, v16x16 *vt) {
+  unsigned int sse;
+  int sum;
+  vp9_get_sse_sum_8x8(s, sp, d, dp, &sse, &sum);
+  fill_variance(&vt->split[0].none, sse, sum, 64);  // top-left
+  vp9_get_sse_sum_8x8(s + 8, sp, d + 8, dp, &sse, &sum);
+  fill_variance(&vt->split[1].none, sse, sum, 64);  // top-right
+  vp9_get_sse_sum_8x8(s + 8 * sp, sp, d + 8 * dp, dp, &sse, &sum);
+  fill_variance(&vt->split[2].none, sse, sum, 64);  // bottom-left
+  vp9_get_sse_sum_8x8(s + 8 * sp + 8, sp, d + 8 + 8 * dp, dp, &sse, &sum);
+  fill_variance(&vt->split[3].none, sse, sum, 64);  // bottom-right
+}
+
+// Combine 2 variance structures: r receives the summed sum_square_error,
+// sum_error and count of a and b, with the variance recomputed from the totals.
+static void sum_2_variances(var *r, const var *a, const var *b) {
+  fill_variance(r, a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->count + b->count);
+}
+// Fill one level of the variance tree from its split[].none sums. VT must be
+// written as `&node` (it is pasted before `.member`); expansion ends in `;`.
+#define fill_variance_tree(VT) \
+  sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
+  sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
+  sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
+  sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
+  sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
+
+// Stamp BLOCKSIZE (or its HORZ/VERT subsize) at R, C when the matching node
+// variances are below `threshold`, then run ACTION. Uses caller's cm, m, mis.
+#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
+  if (VT.none.variance < threshold) { \
+    set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
+    ACTION; \
+  }
+
+static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
+                                int mi_col) {
+  // Picks sb_type for every block of the 64x64 superblock at mi_row, mi_col.
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  // TODO(JBB): More experimentation or testing of this threshold;
+  int64_t threshold = 4;
+  int i, j, k;
+  v64x64 vt;
+  unsigned char * s;
+  int sp;
+  const unsigned char * d = xd->plane[0].pre->buf;
+  int dp = xd->plane[0].pre->stride;
+
+  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
+  // but this needs more experimentation.
+  threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
+
+  // Always measure the source against an all-zero block instead of the `pre`
+  // buffer; the KEY_FRAME-only restriction on this override is disabled.
+  d = vp9_64x64_zeros;
+  dp = 64;
+  // Fill in the entire tree of 8x8 variances for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    for (j = 0; j < 4; j++) {
+      const int x_idx = x32_idx + ((j & 1) << 4);
+      const int y_idx = y32_idx + ((j >> 1) << 4);
+      fill_16x16_variance(s + y_idx * sp + x_idx, sp, d + y_idx * dp + x_idx,
+                          dp, &vt.split[i].split[j]);
+    }
+  }
+  // Fill the rest of the variance tree by summing the split partition values.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      fill_variance_tree(&vt.split[i].split[j])
+    }
+    fill_variance_tree(&vt.split[i])
+  }
+  fill_variance_tree(&vt)
+
+  // Walk down the tree: a block keeps its size (or a horz/vert half split)
+  // as soon as its own node's variance is below the threshold; otherwise it
+  // is split further, down to 8x8.
+  set_vt_size(vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
+  for (i = 0; i < 4; ++i) {
+    const int x32_idx = ((i & 1) << 2);
+    const int y32_idx = ((i >> 1) << 2);
+    set_vt_size(vt.split[i], BLOCK_SIZE_SB32X32, mi_row + y32_idx,
+                mi_col + x32_idx, continue);
+
+    for (j = 0; j < 4; ++j) {
+      const int x16_idx = ((j & 1) << 1);
+      const int y16_idx = ((j >> 1) << 1);
+      set_vt_size(vt.split[i].split[j], BLOCK_SIZE_MB16X16,
+                  mi_row + y32_idx + y16_idx, mi_col + x32_idx + x16_idx,
+                  continue);
+      for (k = 0; k < 4; ++k) {
+        const int x8_idx = (k & 1);
+        const int y8_idx = (k >> 1);
+        set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+                       mi_row + y32_idx + y16_idx + y8_idx,
+                       mi_col + x32_idx + x16_idx + x8_idx);
+      }
+    }
+  }
+}
+static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
+                             int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
+                             int *rate, int *dist) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl);
+  int bss = (1 << bsl)/4;  // quadrant offset into m, in MODE_INFO entries
+  int i, pl;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  int r = 0, d = 0;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;  // NOTE: *rate and *dist are left unwritten on this path
+
+  // Follow the partitioning already stored in m (sb_type); no search is done.
+  bwl = b_width_log2(m->mbmi.sb_type);
+  bhl = b_height_log2(m->mbmi.sb_type);
+
+  // derive the partition type by comparing the stored block size with bsize
+  if ((bwl == bsl) && (bhl == bsl))
+    partition = PARTITION_NONE;
+  else if ((bwl == bsl) && (bhl < bsl))
+    partition = PARTITION_HORZ;
+  else if ((bwl < bsl) && (bhl == bsl))
+    partition = PARTITION_VERT;
+  else if ((bwl < bsl) && (bhl < bsl))
+    partition = PARTITION_SPLIT;
+  else
+    assert(0);
+
+  subsize = get_subsize(bsize, partition);
+
+  // TODO(JBB): this restriction is here because pick_sb_modes can return
+  // r's that are INT_MAX meaning we can't select a mode / mv for this block.
+  // when the code is made to work for less than sb8x8 we need to come up with
+  // a solution to this problem.
+  assert(subsize >= BLOCK_SIZE_SB8X8);
+
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
+    *(get_sb_partitioning(x, bsize)) = subsize;
+  }
+
+  pl = partition_plane_context(xd, bsize);
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                    get_block_context(x, bsize));
+      r += x->partition_cost[pl][PARTITION_NONE];
+      break;
+    case PARTITION_HORZ:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_row + (bs >> 1) <= cm->mi_rows) {  // bs >> 1 is twice the offset
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_HORZ];
+      break;
+    case PARTITION_VERT:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_col + (bs >> 1) <= cm->mi_cols) {  // bs >> 1 is twice the offset
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_VERT];
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      break;  // NOTE(review): only VERT restores contexts before break; confirm
+    case PARTITION_SPLIT:
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * (bs >> 2);
+        int y_idx = (i >> 1) * (bs >> 2);
+        int jj = i >> 1, ii = i & 0x01;
+        int rt, dt;
+
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        *(get_sb_index(xd, subsize)) = i;
+
+        rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
+                         mi_col + x_idx, subsize, &rt, &dt);
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_SPLIT];
+      break;
+    default:
+      assert(0);
+  }
+
+  // update partition context (the opening `if` differs per CONFIG_AB4X4)
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8
+      && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (r < INT_MAX && d < INT_MAX)  // INT_MAX: no mode could be selected
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+  *rate = r;
+  *dist = d;
+}
+
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previously rate-distortion optimization
 // results, for encoding speed-up.
@@ -873,7 +1236,7 @@
   ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
   TOKENEXTRA *tp_orig = *tp;
-  int i, p, pl;
+  int i, pl;
   BLOCK_SIZE_TYPE subsize;
   int srate = INT_MAX, sdist = INT_MAX;
 
@@ -885,19 +1248,7 @@
     }
   assert(mi_height_log2(bsize) == mi_width_log2(bsize));
 
-  // buffer the above/left context information of the block in search.
-  for (p = 0; p < MAX_MB_PLANE; ++p) {
-    vpx_memcpy(a + bs * p, cm->above_context[p] +
-               (mi_col * 2 >> xd->plane[p].subsampling_x),
-               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
-    vpx_memcpy(l + bs * p, cm->left_context[p] +
-               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
-               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
-  }
-  vpx_memcpy(sa, cm->above_seg_context + mi_col,
-             sizeof(PARTITION_CONTEXT) * ms);
-  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
-             sizeof(PARTITION_CONTEXT) * ms);
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
   // PARTITION_SPLIT
   if (bsize >= BLOCK_SIZE_SB8X8) {
@@ -1025,6 +1376,8 @@
   *rate = srate;
   *dist = sdist;
 
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
   if (srate < INT_MAX && sdist < INT_MAX)
     encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
 
@@ -1050,8 +1403,22 @@
   for (mi_col = cm->cur_tile_mi_col_start;
        mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
     int dummy_rate, dummy_dist;
-    rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
-                      &dummy_rate, &dummy_dist);
+    // TODO(JBB): remove the border conditions for 64x64 blocks once it's fixed;
+    // without this border check choose_partitioning will fail on the border
+    // of every superblock that is not a full 64x64.
+    if (cpi->speed < 5 ||
+        mi_col + 8 > cm->cur_tile_mi_col_end ||
+        mi_row + 8 > cm->cur_tile_mi_row_end) {
+      rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                        &dummy_rate, &dummy_dist);
+    } else {
+      const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+      MODE_INFO *m = cm->mi + idx_str;
+      // set_partitioning(cpi, m, BLOCK_SIZE_SB8X8);
+      choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+      rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                       &dummy_rate, &dummy_dist);
+    }
   }
 }
 
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -696,6 +696,25 @@
   sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
   sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;
 
+  if (speed > 4) {
+    for (i = 0; i < MAX_MODES; ++i)
+      sf->thresh_mult[i] = INT_MAX;
+
+    sf->thresh_mult[THR_DC       ] = 0;
+    sf->thresh_mult[THR_TM       ] = 0;
+    sf->thresh_mult[THR_NEWMV    ] = 4000;
+    sf->thresh_mult[THR_NEWG     ] = 4000;
+    sf->thresh_mult[THR_NEWA     ] = 4000;
+    sf->thresh_mult[THR_NEARESTMV] = 0;
+    sf->thresh_mult[THR_NEARESTG ] = 0;
+    sf->thresh_mult[THR_NEARESTA ] = 0;
+    sf->thresh_mult[THR_NEARMV   ] = 2000;
+    sf->thresh_mult[THR_NEARG    ] = 2000;
+    sf->thresh_mult[THR_NEARA    ] = 2000;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+    sf->recode_loop = 0;
+  }
+
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -804,48 +823,6 @@
 #endif
 #endif
       sf->mb16_breakout = 0;
-
-      if (speed > 0) {
-        /* Disable coefficient optimization above speed 0 */
-        sf->optimize_coefficients = 0;
-        sf->no_skip_block4x4_search = 0;
-        sf->comp_inter_joint_search = 0;
-
-        sf->first_step = 1;
-
-        cpi->mode_check_freq[THR_SPLITG] = 2;
-        cpi->mode_check_freq[THR_SPLITA] = 2;
-        cpi->mode_check_freq[THR_SPLITMV] = 0;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
-      }
-
-      if (speed > 1) {
-        cpi->mode_check_freq[THR_SPLITG] = 4;
-        cpi->mode_check_freq[THR_SPLITA] = 4;
-        cpi->mode_check_freq[THR_SPLITMV] = 2;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-      }
-
-      if (speed > 2) {
-        cpi->mode_check_freq[THR_SPLITG] = 15;
-        cpi->mode_check_freq[THR_SPLITA] = 15;
-        cpi->mode_check_freq[THR_SPLITMV] = 7;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
-        // Only do recode loop on key frames, golden frames and
-        // alt ref frames
-        sf->recode_loop = 2;
-      }
-
       break;
 
   }; /* switch */
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -620,9 +620,25 @@
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
   vp9_subtract_sby(x, bs);
 
+  if (cpi->speed > 4) {
+    if (bs >= BLOCK_SIZE_SB32X32) {
+      mbmi->txfm_size = TX_32X32;
+    } else if (bs >= BLOCK_SIZE_MB16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (bs >= BLOCK_SIZE_SB8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+                             mbmi->txfm_size);
+    return;
+  }
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
                              bs, TX_32X32);
@@ -842,7 +858,7 @@
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
@@ -863,7 +879,6 @@
     int64_t local_txfm_cache[NB_TXFM_MODES];
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
-
     if (cpi->common.frame_type == KEY_FRAME) {
       const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
       const MB_PREDICTION_MODE L = xd->left_available ?
@@ -871,12 +886,12 @@
 
       bmode_costs = x->y_mode_costs[A][L];
     }
-
     x->e_mbd.mode_info_context->mbmi.mode = mode;
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
                     bsize, local_txfm_cache);
+
     this_rate = this_rate_tokenonly + bmode_costs[mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
@@ -2273,7 +2288,9 @@
                  (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (1) {
+  if (cpi->speed > 4) {
+    *best_filter = EIGHTTAP;
+  } else {
     int i, newbest;
     int tmp_rate_sum = 0, tmp_dist_sum = 0;
     for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
@@ -2410,6 +2427,7 @@
     // Y cost and distortion
     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
                     bsize, txfm_cache);
+
     *rate2 += *rate_y;
     *distortion += *distortion_y;
 
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -318,6 +318,11 @@
   return (var - (((unsigned int)avg * avg) >> 7));
 }
 
+void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {  // 8x8 variance() wrapper
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
+}
 
 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
                                int  source_stride,