shithub: libvpx

Download patch

ref: 03b412d0449146ecd7e3398448cfa91c2acca05e
parent: a43bdcd7b021d7aa091a516ac313930b3d28fe6e
parent: d0ed677a34d4778d96ee4c31d04e153b52f14394
author: John Koleszar <[email protected]>
date: Tue Jun 11 14:19:14 EDT 2013

VP9 profile 0 release candidate

Merge experimental branch into master

Change-Id: Ie5f89fb977d28a4d98a8dcdf1c6eb97271a3c1db

--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -103,6 +103,18 @@
 .PHONY: testdata
 testdata::
 
+# Add compiler flags for intrinsic files
+$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
+$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -266,12 +266,13 @@
 fi
 TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
 TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
 TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
 TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
 TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
 
 clean_temp_files() {
-    rm -f ${TMP_C} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
 }
 
 #
@@ -292,9 +293,9 @@
 
 check_cxx() {
     log check_cxx "$@"
-    cat >${TMP_C}
-    log_file ${TMP_C}
-    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+    cat >${TMP_CC}
+    log_file ${TMP_CC}
+    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
 }
 
 check_cpp() {
@@ -1071,7 +1072,7 @@
                 tune_cflags="-march="
                 setup_gnu_toolchain
                 #for 32 bit x86 builds, -O3 did not turn on this flag
-                enabled optimizations && check_add_cflags -fomit-frame-pointer
+                enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
             ;;
             vs*)
                 # When building with Microsoft Visual Studio the assembler is
--- a/configure
+++ b/configure
@@ -243,19 +243,11 @@
     unistd_h
 "
 EXPERIMENT_LIST="
-    csm
-    new_mvref
-    implicit_segmentation
-    newbintramodes
-    comp_interintra_pred
-    enable_6tap
-    abovesprefmv
-    code_nonzerocount
-    useselectrefmv
-    modelcoefprob
-    loop_dering
-    implicit_compoundinter_weight
-    scatterscan
+    oneshotq
+    multiple_arf
+    non420
+    alpha
+    balanced_coeftree
 "
 CONFIG_LIST="
     external_build
@@ -608,7 +600,10 @@
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
-        check_add_cflags -Wunused-but-set-variable
+        case ${CC} in
+          *clang*) ;;
+          *) check_add_cflags -Wunused-but-set-variable ;;
+        esac
         enabled extra_warnings || check_add_cflags -Wno-unused-function
     fi
 
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -34,6 +34,13 @@
     return (value >> 24) & 0xff;
   }
 
+  uint8_t Rand8Extremes(void) {
+    // Returns a random value near 0 or near 255, to better exercise
+    // saturation behavior.
+    const uint8_t r = Rand8();
+    return r < 128 ? r << 4 : r >> 4;
+  }
+
   int PseudoUniform(int range) {
     return random_.Generate(range);
   }
--- /dev/null
+++ b/test/borders_test.cc
@@ -1,0 +1,86 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTest : public ::libvpx_test::EncoderTest,
+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
+ protected:
+  BordersTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if ( video->frame() == 1) {
+      encoder->Control(VP8E_SET_CPUUSED, 0);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
+    }
+  }
+};
+
+TEST_P(BordersTest, TestEncodeHighBitrate) {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q.  This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 10;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTest, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q.  This pushes the encoder to producing
+  // lots of small partitions which will likely test the other condition.
+
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+VP9_INSTANTIATE_TEST_CASE(BordersTest, ::testing::Values(
+    ::libvpx_test::kTwoPassGood));
+}  // namespace
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -8,6 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
 #include "./vpx_config.h"
@@ -16,10 +20,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 }
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
 
 namespace {
 typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
@@ -46,20 +46,20 @@
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
-static uint8_t clip_pixel(int x) {
+uint8_t clip_pixel(int x) {
   return x < 0 ? 0 :
          x > 255 ? 255 :
          x;
 }
 
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride,
-                               unsigned int output_width,
-                               unsigned int output_height) {
+void filter_block2d_8_c(const uint8_t *src_ptr,
+                        const unsigned int src_stride,
+                        const int16_t *HFilter,
+                        const int16_t *VFilter,
+                        uint8_t *dst_ptr,
+                        unsigned int dst_stride,
+                        unsigned int output_width,
+                        unsigned int output_height) {
   // Between passes, we use an intermediate buffer whose height is extended to
   // have enough horizontally filtered values as input for the vertical pass.
   // This buffer is allocated to be big enough for the largest block type we
@@ -66,7 +66,7 @@
   // support.
   const int kInterp_Extend = 4;
   const unsigned int intermediate_height =
-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
 
   /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
    * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
@@ -75,7 +75,7 @@
    *                               = 23
    * and filter_max_width = 16
    */
-  uint8_t intermediate_buffer[23 * 16];
+  uint8_t intermediate_buffer[71 * 64];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -87,15 +87,15 @@
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -115,15 +115,15 @@
     for (i = 0; i < output_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -135,12 +135,12 @@
   }
 }
 
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              unsigned int output_width,
-                              unsigned int output_height) {
+void block2d_average_c(uint8_t *src,
+                       unsigned int src_stride,
+                       uint8_t *output_ptr,
+                       unsigned int output_stride,
+                       unsigned int output_width,
+                       unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -150,21 +150,21 @@
   }
 }
 
-static void filter_average_block2d_8_c(const uint8_t *src_ptr,
-                                       const unsigned int src_stride,
-                                       const int16_t *HFilter,
-                                       const int16_t *VFilter,
-                                       uint8_t *dst_ptr,
-                                       unsigned int dst_stride,
-                                       unsigned int output_width,
-                                       unsigned int output_height) {
-  uint8_t tmp[16*16];
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter,
+                                const int16_t *VFilter,
+                                uint8_t *dst_ptr,
+                                unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[64 * 64];
 
-  assert(output_width <= 16);
-  assert(output_height <= 16);
-  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 16,
+  assert(output_width <= 64);
+  assert(output_height <= 64);
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
                      output_width, output_height);
-  block2d_average_c(tmp, 16, dst_ptr, dst_stride,
+  block2d_average_c(tmp, 64, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
@@ -173,10 +173,9 @@
   static void SetUpTestCase() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
-        + 1;
+        vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
   }
 
   static void TearDownTestCase() {
@@ -186,62 +185,63 @@
     output_ = NULL;
   }
 
-  protected:
-    static const int kDataAlignment = 16;
-    static const int kOuterBlockSize = 32;
-    static const int kInputStride = kOuterBlockSize;
-    static const int kOutputStride = kOuterBlockSize;
-    static const int kMaxDimension = 16;
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 128;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kMaxDimension = 64;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
 
-    int Width() const { return GET_PARAM(0); }
-    int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const {
-      const int center = (kOuterBlockSize - Width()) / 2;
-      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
-    }
-    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
 
-    bool IsIndexInBorder(int i) {
-      return (i < BorderTop() * kOuterBlockSize ||
-              i >= (BorderTop() + Height()) * kOuterBlockSize ||
-              i % kOuterBlockSize < BorderLeft() ||
-              i % kOuterBlockSize >= (BorderLeft() + Width()));
-    }
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
 
-    virtual void SetUp() {
-      UUT_ = GET_PARAM(2);
-      memset(input_, 0, sizeof(input_));
-      /* Set up guard blocks for an inner block cetered in the outer block */
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          output_[i] = 255;
-        else
-          output_[i] = 0;
-      }
-
-      ::libvpx_test::ACMRandom prng;
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
-        input_[i] = prng.Rand8();
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        output_[i] = 255;
+      else
+        output_[i] = 0;
     }
 
-    void CheckGuardBlocks() {
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          EXPECT_EQ(255, output_[i]);
-      }
-    }
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i)
+      input_[i] = prng.Rand8Extremes();
+  }
 
-    uint8_t* input() {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
     }
+  }
 
-    uint8_t* output() {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
+  uint8_t* input() const {
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    const ConvolveFunctions* UUT_;
-    static uint8_t* input_;
-    static uint8_t* output_;
+  uint8_t* output() const {
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
+
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
 };
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
@@ -303,12 +303,34 @@
 
 const int16_t (*kTestFilterList[])[8] = {
   vp9_bilinear_filters,
-  vp9_sub_pel_filters_6,
   vp9_sub_pel_filters_8,
   vp9_sub_pel_filters_8s,
   vp9_sub_pel_filters_8lp
 };
+const int kNumFilterBanks = sizeof(kTestFilterList) /
+                            sizeof(kTestFilterList[0]);
+const int kNumFilters = 16;
 
+TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+    const int16_t (*filters)[8] = kTestFilterList[filter_bank];
+    for (int i = 0; i < kNumFilters; i++) {
+      const int p0 = filters[i][0] + filters[i][1];
+      const int p1 = filters[i][2] + filters[i][3];
+      const int p2 = filters[i][4] + filters[i][5];
+      const int p3 = filters[i][6] + filters[i][7];
+      EXPECT_LE(p0, 128);
+      EXPECT_LE(p1, 128);
+      EXPECT_LE(p2, 128);
+      EXPECT_LE(p3, 128);
+      EXPECT_LE(p0 + p3, 128);
+      EXPECT_LE(p0 + p3 + p1, 128);
+      EXPECT_LE(p0 + p3 + p1 + p2, 128);
+      EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+    }
+  }
+}
+
 const int16_t kInvalidFilter[8] = { 0 };
 
 TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
@@ -316,12 +338,9 @@
   uint8_t* const out = output();
   uint8_t ref[kOutputStride * kMaxDimension];
 
-  const int kNumFilterBanks = sizeof(kTestFilterList) /
-      sizeof(kTestFilterList[0]);
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const int16_t (*filters)[8] = kTestFilterList[filter_bank];
-    const int kNumFilters = 16;
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -368,7 +387,7 @@
   ::libvpx_test::ACMRandom prng;
   for (int y = 0; y < Height(); ++y) {
     for (int x = 0; x < Width(); ++x) {
-      const uint8_t r = prng.Rand8();
+      const uint8_t r = prng.Rand8Extremes();
 
       out[y * kOutputStride + x] = r;
       ref[y * kOutputStride + x] = r;
@@ -440,6 +459,7 @@
 TEST_P(ConvolveTest, ChangeFilterWorks) {
   uint8_t* const in = input();
   uint8_t* const out = output();
+  const int kPixelSelected = 4;
 
   REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
                                  kChangeFilters[8], 17, kChangeFilters[4], 16,
@@ -446,10 +466,10 @@
                                  Width(), Height()));
 
   for (int x = 0; x < Width(); ++x) {
-    if (x < 8)
-      ASSERT_EQ(in[4], out[x]) << "x == " << x;
-    else
-      ASSERT_EQ(in[12], out[x]) << "x == " << x;
+    const int kQ4StepAdjust = x >> 4;
+    const int kFilterPeriodAdjust = (x >> 3) << 3;
+    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
   }
 
   REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
@@ -457,10 +477,10 @@
                                  Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
-    if (y < 8)
-      ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y;
-    else
-      ASSERT_EQ(in[12 * kInputStride], out[y * kOutputStride]) << "y == " << y;
+    const int kQ4StepAdjust = y >> 4;
+    const int kFilterPeriodAdjust = (y >> 3) << 3;
+    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
   }
 
   REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
@@ -468,9 +488,13 @@
                                   Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
+    const int kQ4StepAdjustY = y >> 4;
+    const int kFilterPeriodAdjustY = (y >> 3) << 3;
+    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
     for (int x = 0; x < Width(); ++x) {
-      const int ref_x = x < 8 ? 4 : 12;
-      const int ref_y = y < 8 ? 4 : 12;
+      const int kQ4StepAdjustX = x >> 4;
+      const int kFilterPeriodAdjustX = (x >> 3) << 3;
+      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;
 
       ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
           << "x == " << x << ", y == " << y;
@@ -489,10 +513,17 @@
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
+    make_tuple(4, 8, &convolve8_c),
     make_tuple(8, 8, &convolve8_c),
     make_tuple(16, 8, &convolve8_c),
-    make_tuple(16, 16, &convolve8_c)));
-}
+    make_tuple(8, 16, &convolve8_c),
+    make_tuple(16, 16, &convolve8_c),
+    make_tuple(32, 16, &convolve8_c),
+    make_tuple(16, 32, &convolve8_c),
+    make_tuple(32, 32, &convolve8_c),
+    make_tuple(64, 32, &convolve8_c),
+    make_tuple(32, 64, &convolve8_c),
+    make_tuple(64, 64, &convolve8_c)));
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
@@ -503,7 +534,16 @@
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
     make_tuple(4, 4, &convolve8_ssse3),
     make_tuple(8, 4, &convolve8_ssse3),
+    make_tuple(4, 8, &convolve8_ssse3),
     make_tuple(8, 8, &convolve8_ssse3),
     make_tuple(16, 8, &convolve8_ssse3),
-    make_tuple(16, 16, &convolve8_ssse3)));
+    make_tuple(8, 16, &convolve8_ssse3),
+    make_tuple(16, 16, &convolve8_ssse3),
+    make_tuple(32, 16, &convolve8_ssse3),
+    make_tuple(16, 32, &convolve8_ssse3),
+    make_tuple(32, 32, &convolve8_ssse3),
+    make_tuple(64, 32, &convolve8_ssse3),
+    make_tuple(32, 64, &convolve8_ssse3),
+    make_tuple(64, 64, &convolve8_ssse3)));
 #endif
+}  // namespace
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -17,6 +17,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "acm_random.h"
@@ -269,19 +270,23 @@
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[256], coeff[256];
-    int16_t out_c[256];
+    uint8_t dst[256], src[256];
     double out_r[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_16x16_dct_2d(in, out_r);
     for (int j = 0; j < 256; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_c(coeff, out_c, 32);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
     for (int j = 0; j < 256; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 16x16 IDCT has error " << error
@@ -289,7 +294,7 @@
     }
   }
 }
-#if 1
+
 // we need enable fdct test once we re-do the 16 point fdct.
 TEST(VP9Fdct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,18 +304,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[256];
     int16_t test_temp_block[256];
-    int16_t test_output_block[256];
+    uint8_t dst[256], src[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 32;
     vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
 
     for (int j = 0; j < 256; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -354,6 +363,4 @@
     }
   }
 }
-#endif
-
 }  // namespace
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -18,7 +18,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
   void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "test/acm_random.h"
@@ -91,28 +91,31 @@
   }
 }
 
-
 TEST(VP9Idct32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[1024], coeff[1024];
-    int16_t out_c[1024];
+    uint8_t dst[1024], src[1024];
     double out_r[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < 1024; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_c(coeff, out_c, 64);
+    vp9_short_idct32x32_add_c(coeff, dst, 32);
     for (int j = 0; j < 1024; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
-          << "Error: 3x32 IDCT has error " << error
+          << "Error: 32x32 IDCT has error " << error
           << " at index " << j;
     }
   }
@@ -126,18 +129,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[1024];
     int16_t test_temp_block[1024];
-    int16_t test_output_block[1024];
+    uint8_t dst[1024], src[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 64;
     vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
 
     for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned diff = dst[j] - src[j];
       const unsigned error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -10,9 +10,10 @@
 #ifndef TEST_ENCODE_TEST_DRIVER_H_
 #define TEST_ENCODE_TEST_DRIVER_H_
 
-#include "./vpx_config.h"
 #include <string>
 #include <vector>
+
+#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_encoder.h"
 
@@ -46,7 +47,7 @@
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-    : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(NULL) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
@@ -92,7 +93,7 @@
     memset(&encoder_, 0, sizeof(encoder_));
   }
 
-  ~Encoder() {
+  virtual ~Encoder() {
     vpx_codec_destroy(&encoder_);
   }
 
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -206,11 +206,17 @@
   // reset previously set error/droppable frames
   Reset();
 
+#if 0
+  // TODO(jkoleszar): This test is disabled for the time being as too
+  // sensitive. It's not clear how to set a reasonable threshold for
+  // this behavior.
+
   // Now set an arbitrary set of error frames that are non-droppable
   unsigned int num_error_frames = 3;
   unsigned int error_frame_list[] = {3, 10, 20};
   SetErrorFrames(num_error_frames, error_frame_list);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
   // Test that dropping an arbitrary set of inter frames does not hurt too much
   // Note the Average Mismatch PSNR is the average of the PSNR between
   // decoded frame and encoder's version of the same frame for all frames
@@ -219,6 +225,7 @@
   std::cout << "             Mismatch PSNR: "
             << psnr_resilience_mismatch << "\n";
   EXPECT_GT(psnr_resilience_mismatch, 20.0);
+#endif
 }
 
 VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -96,11 +96,15 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
     int16_t test_temp_block[16];
-    int16_t test_output_block[16];
+    uint8_t dst[16], src[16];
 
+    for (int j = 0; j < 16; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     // TODO(Yaowu): this should be converted to a parameterized test
     // to test optimized versions of this function.
@@ -120,10 +124,10 @@
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
 
     for (int j = 0; j < 16; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -16,6 +16,7 @@
 
 extern "C" {
 #include "vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "acm_random.h"
@@ -100,11 +101,15 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[64];
     int16_t test_temp_block[64];
-    int16_t test_output_block[64];
+    uint8_t dst[64], src[64];
 
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
     vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
@@ -119,10 +124,10 @@
           test_temp_block[j] *= 4;
         }
     }
-    vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
 
     for (int j = 0; j < 64; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -145,18 +150,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[64];
     int16_t test_temp_block[64];
-    int16_t test_output_block[64];
+    uint8_t dst[64], src[64];
 
-    // Initialize a test block with input range {-255, 255}.
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8() % 2 ? 255 : 0;
+      dst[j] = src[j] > 0 ? 0 : 255;
+    }
+    // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
-      test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 16;
     vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
 
     for (int j = 0; j < 64; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/test/i420_video_source.h
+++ b/test/i420_video_source.h
@@ -83,7 +83,7 @@
   void SetSize(unsigned int width, unsigned int height) {
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 1);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 1);
       ASSERT_TRUE(img_ != NULL);
       width_ = width;
       height_ = height;
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -112,20 +112,23 @@
   const int count_test_block = 10000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t input[64], coeff[64];
-    int16_t output_c[64];
     double output_r[64];
+    uint8_t dst[64], src[64];
 
+    for (int j = 0; j < 64; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 64; ++j)
-      input[j] = rnd.Rand8() - rnd.Rand8();
+      input[j] = src[j] - dst[j];
 
-    const int pitch = 16;
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
       coeff[j] = round(output_r[j]);
-    vp9_short_idct8x8_c(coeff, output_c, pitch);
+    vp9_short_idct8x8_add_c(coeff, dst, 8);
     for (int j = 0; j < 64; ++j) {
-      const int diff = output_c[j] -input[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 8x8 FDCT/IDCT has error " << error
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -30,7 +30,7 @@
   }
 
   virtual void TearDown() {
-    delete modified_buf_;
+    delete[] modified_buf_;
   }
 
   virtual bool Continue() const {
@@ -59,7 +59,7 @@
         buffer[pkt->data.frame.sz - index_sz] == marker) {
       // frame is a superframe. strip off the index.
       if (modified_buf_)
-        delete modified_buf_;
+        delete[] modified_buf_;
       modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
       memcpy(modified_buf_, pkt->data.frame.buf,
              pkt->data.frame.sz - index_sz);
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,4 +1,5 @@
 d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
+b87815bf86020c592ccc7a846ba2e28ec8043902  hantro_odd.yuv
 5184c46ddca8b1fadd16742e8500115bc8f749da  vp80-00-comprehensive-001.ivf
 65bf1bbbced81b97bd030f376d1b7f61a224793f  vp80-00-comprehensive-002.ivf
 906b4c1e99eb734504c504b3f1ad8052137ce672  vp80-00-comprehensive-003.ivf
@@ -120,4 +121,4 @@
 41d70bb5fa45bc88da1604a0af466930b8dd77b5  vp80-05-sharpness-1438.ivf.md5
 086c56378df81b6cee264d7540a7b8f2b405c7a4  vp80-05-sharpness-1439.ivf.md5
 d32dc2c4165eb266ea4c23c14a45459b363def32  vp80-05-sharpness-1440.ivf.md5
-8c69dc3d8e563f56ffab5ad1e400d9e689dd23df  vp80-05-sharpness-1443.ivf.md5
\ No newline at end of file
+8c69dc3d8e563f56ffab5ad1e400d9e689dd23df  vp80-05-sharpness-1443.ivf.md5
--- a/test/test.mk
+++ b/test/test.mk
@@ -22,6 +22,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c
@@ -92,6 +93,7 @@
 ## TEST DATA
 ##
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
 
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -56,7 +56,13 @@
 
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
-    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
     const vpx_image_t *img = dec->GetDxData().Next();
     md5->Add(img);
   }
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -188,11 +188,11 @@
 #endif
 
 #if HAVE_SSE2
-const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_wmt;
-const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_wmt;
-const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_wmt;
-const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_wmt;
-const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_wmt;
+const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2;
+const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2;
+const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2;
+const vp9_variance_fn_t variance16x8_wmt = vp9_variance16x8_sse2;
+const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2;
 INSTANTIATE_TEST_CASE_P(
     SSE2, VP9VarianceTest,
     ::testing::Values(make_tuple(4, 4, variance4x4_wmt),
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -103,7 +103,7 @@
     if (width != width_ || height != height_) {
       vpx_img_free(img_);
       raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
-      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 32);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32);
       width_ = width;
       height_ = height;
     }
--- a/test/vp9_boolcoder_test.cc
+++ b/test/vp9_boolcoder_test.cc
@@ -52,7 +52,7 @@
         const int random_seed = 6432;
         const int buffer_size = 10000;
         ACMRandom bit_rnd(random_seed);
-        BOOL_CODER bw;
+        vp9_writer bw;
         uint8_t bw_buffer[buffer_size];
         vp9_start_encode(&bw, bw_buffer);
 
@@ -63,13 +63,16 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          encode_bool(&bw, bit, static_cast<int>(probas[i]));
+          vp9_write(&bw, bit, static_cast<int>(probas[i]));
         }
 
         vp9_stop_encode(&bw);
 
-        BOOL_DECODER br;
-        vp9_start_decode(&br, bw_buffer, buffer_size);
+        // First bit should be zero
+        GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0);
+
+        vp9_reader br;
+        vp9_reader_init(&br, bw_buffer, buffer_size);
         bit_rnd.Reset(random_seed);
         for (int i = 0; i < bits_to_test; ++i) {
           if (bit_method == 2) {
@@ -77,7 +80,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          GTEST_ASSERT_EQ(decode_bool(&br, probas[i]), bit)
+          GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit)
               << "pos: " << i << " / " << bits_to_test
               << " bit_method: " << bit_method
               << " method: " << method;
--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@@ -632,7 +632,7 @@
   { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 #endif
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER)
 
 #define HAS_SCALEROWDOWN2_SSE2
 // Reads 32 pixels, throws half away and writes 16 pixels.
--- a/tools/cpplint.py
+++ b/tools/cpplint.py
@@ -53,12 +53,8 @@
 #  - Check for 0 in char context (should be '\0')
 #  - Check for camel-case method name conventions for methods
 #    that are not simple inline getters and setters
-#  - Check that base classes have virtual destructors
-#    put "  // namespace" after } that closes a namespace, with
-#    namespace's name after 'namespace' if it is named.
 #  - Do not indent namespace contents
 #  - Avoid inlining non-trivial constructors in header files
-#    include base/basictypes.h if DISALLOW_EVIL_CONSTRUCTORS is used
 #  - Check for old-school (void) cast for call-sites of functions
 #    ignored return value
 #  - Check gUnit usage of anonymous namespace
@@ -80,6 +76,7 @@
 """
 
 import codecs
+import copy
 import getopt
 import math  # for log
 import os
@@ -139,6 +136,22 @@
       the top-level categories like 'build' and 'whitespace' will
       also be printed. If 'detailed' is provided, then a count
       is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -161,6 +174,7 @@
   'build/printf_format',
   'build/storage_class',
   'legal/copyright',
+  'readability/alt_tokens',
   'readability/braces',
   'readability/casting',
   'readability/check',
@@ -169,6 +183,7 @@
   'readability/function',
   'readability/multiline_comment',
   'readability/multiline_string',
+  'readability/namespace',
   'readability/nolint',
   'readability/streams',
   'readability/todo',
@@ -189,13 +204,14 @@
   'runtime/sizeof',
   'runtime/string',
   'runtime/threadsafe_fn',
-  'runtime/virtual',
   'whitespace/blank_line',
   'whitespace/braces',
   'whitespace/comma',
   'whitespace/comments',
+  'whitespace/empty_loop_body',
   'whitespace/end_of_line',
   'whitespace/ending_newline',
+  'whitespace/forcolon',
   'whitespace/indent',
   'whitespace/labels',
   'whitespace/line_length',
@@ -278,7 +294,35 @@
   _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
+# Alternative tokens and their replacements.  For full list, see section 2.5
+# Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+    'and': '&&',
+    'bitor': '|',
+    'or': '||',
+    'xor': '^',
+    'compl': '~',
+    'bitand': '&',
+    'and_eq': '&=',
+    'or_eq': '|=',
+    'xor_eq': '^=',
+    'not': '!',
+    'not_eq': '!='
+    }
 
+# Compile regular expression that matches all the above keywords.  The "[ =()]"
+# bit is meant to avoid matching these keywords outside of boolean expressions.
+#
+# False positives include C-style multi-line comments (http://go/nsiut )
+# and multi-line strings (http://go/beujw ), but those have always been
+# troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
+
 # These constants define types of headers for use with
 # _IncludeState.CheckNextIncludeOrder().
 _C_SYS_HEADER = 1
@@ -287,7 +331,18 @@
 _POSSIBLE_MY_HEADER = 4
 _OTHER_HEADER = 5
 
+# These constants define the current inline assembly state
+_NO_ASM = 0       # Outside of inline assembly block
+_INSIDE_ASM = 1   # Inside inline assembly block
+_END_ASM = 2      # Last line of inline assembly block
+_BLOCK_ASM = 3    # The whole block is an inline assembly block
 
+# Match start of assembly blocks
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+                        r'(?:\s+(volatile|__volatile__))?'
+                        r'\s*[{(]')
+
+
 _regexp_compile_cache = {}
 
 # Finds occurrences of NOLINT or NOLINT(...).
@@ -297,6 +352,10 @@
 # on which those errors are expected and should be suppressed.
 _error_suppressions = {}
 
+# The root directory used for deriving header guard CPP variable.
+# This is set by --root flag.
+_root = None
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
   """Updates the global list of error-suppressions.
 
@@ -925,7 +984,7 @@
 
   1) elided member contains lines without strings and comments,
   2) lines member contains lines without comments, and
-  3) raw member contains all the lines without processing.
+  3) raw_lines member contains all the lines without processing.
   All these three members are of <type 'list'>, and of the same length.
   """
 
@@ -965,6 +1024,29 @@
     return elided
 
 
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    Index just after endchar.
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return i + 1
+  return -1
+
+
 def CloseExpression(clean_lines, linenum, pos):
   """If input points to ( or { or [, finds the position that closes it.
 
@@ -991,18 +1073,23 @@
   if startchar == '[': endchar = ']'
   if startchar == '{': endchar = '}'
 
-  num_open = line.count(startchar) - line.count(endchar)
-  while linenum < clean_lines.NumLines() and num_open > 0:
+  # Check first line
+  end_pos = FindEndOfExpressionInLine(line, pos, 0, startchar, endchar)
+  if end_pos > -1:
+    return (line, linenum, end_pos)
+  tail = line[pos:]
+  num_open = tail.count(startchar) - tail.count(endchar)
+  while linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    num_open += line.count(startchar) - line.count(endchar)
-  # OK, now find the endchar that actually got us back to even
-  endpos = len(line)
-  while num_open >= 0:
-    endpos = line.rfind(')', 0, endpos)
-    num_open -= 1                 # chopped off another )
-  return (line, linenum, endpos + 1)
+    delta = line.count(startchar) - line.count(endchar)
+    if num_open + delta <= 0:
+      return (line, linenum,
+              FindEndOfExpressionInLine(line, 0, num_open, startchar, endchar))
+    num_open += delta
 
+  # Did not find endchar before end of file, give up
+  return (line, clean_lines.NumLines(), -1)
 
 def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
@@ -1032,9 +1119,13 @@
   # Restores original filename in case that cpplint is invoked from Emacs's
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
 
   fileinfo = FileInfo(filename)
-  return re.sub(r'[-./\s]', '_', fileinfo.RepositoryName()).upper() + '_'
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
 
 
 def CheckForHeaderGuard(filename, lines, error):
@@ -1259,17 +1350,55 @@
           'Changing pointer instead of value (or unused value of operator*).')
 
 
-class _ClassInfo(object):
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified.  For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+
+class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
-  def __init__(self, name, clean_lines, linenum):
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
     self.name = name
-    self.linenum = linenum
-    self.seen_open_brace = False
+    self.starting_linenum = linenum
     self.is_derived = False
-    self.virtual_method_linenumber = None
-    self.has_virtual_destructor = False
-    self.brace_depth = 0
+    if class_or_struct == 'struct':
+      self.access = 'public'
+    else:
+      self.access = 'private'
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1279,26 +1408,324 @@
     self.last_line = 0
     depth = 0
     for i in range(linenum, clean_lines.NumLines()):
-      line = clean_lines.lines[i]
+      line = clean_lines.elided[i]
       depth += line.count('{') - line.count('}')
       if not depth:
         self.last_line = i
         break
 
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # Look for a bare ':'
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
 
-class _ClassState(object):
-  """Holds the current state of the parse relating to class declarations.
 
-  It maintains a stack of _ClassInfos representing the parser's guess
-  as to the current nesting of class declarations. The innermost class
-  is at the top (back) of the stack. Typically, the stack will either
-  be empty or have exactly one entry.
-  """
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
 
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name or ''
+    self.starting_linenum = linenum
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines is enclosed in this namespace.  Don't issue
+    # warning for missing namespace comments if there aren't enough
+    # lines.  However, do apply checks if there is already an end of
+    # namespace comment and it's incorrect.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations).  There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if (linenum - self.starting_linenum < 10
+        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+      return
+
+    # Look for matching comment at end of namespace.
+    #
+    # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminate namespaces inside
+    # preprocessor macros can be cpplint clean.  Example: http://go/nxpiz
+    #
+    # We also accept stuff like "// end of namespace <name>." with the
+    # period at the end.
+    #
+    # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when existing comment is a substring of the
+    # expected namespace.  Example: http://go/ldkdc, http://cl/23548205
+    if self.name:
+      # Named namespace
+      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                    r'[\*/\.\\\s]*$'),
+                   line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace %s"' %
+              self.name)
+    else:
+      # Anonymous namespace
+      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace"')
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # The entire nesting stack before #if
+    self.stack_before_if = stack_before_if
+
+    # The entire nesting stack up to #else
+    self.stack_before_else = []
+
+    # Whether we have already seen #else or #elif
+    self.seen_else = False
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
+
   def __init__(self):
-    self.classinfo_stack = []
+    # Stack for tracking all braces.  An object is pushed whenever we
+    # see a "{", and popped when we see a "}".  Only 3 types of
+    # objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
 
-  def CheckFinished(self, filename, error):
+    # Stack of _PreprocessorInfo objects.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if we have seen the opening brace, False if the innermost
+      block is still expecting an opening brace.
+    """
+    return (not self.stack) or self.stack[-1].seen_open_brace
+
+  def InNamespaceBody(self):
+    """Check if we are currently one level inside a namespace body.
+
+    Returns:
+      True if top of the stack is a namespace block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+  def UpdatePreprocessor(self, line):
+    """Update preprocessor stack.
+
+    We need to handle preprocessors due to classes like this:
+      #ifdef SWIG
+      struct ResultDetailsPageElementExtensionPoint {
+      #else
+      struct ResultDetailsPageElementExtensionPoint : public Extension {
+      #endif
+    (see http://go/qwddn for original example)
+
+    We make the following assumptions (good enough for most files):
+    - Preprocessor condition evaluates to true from #if up to first
+      #else/#elif/#endif.
+
+    - Preprocessor condition evaluates to false from #else/#elif up
+      to #endif.  We still perform lint checks on these lines, but
+      these do not affect nesting stack.
+
+    Args:
+      line: current line to check.
+    """
+    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
+      # Beginning of #if block, save the nesting stack here.  The saved
+      # stack will allow us to restore the parsing state in the #else case.
+      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
+    elif Match(r'^\s*#\s*(else|elif)\b', line):
+      # Beginning of #else block
+      if self.pp_stack:
+        if not self.pp_stack[-1].seen_else:
+          # This is the first #else or #elif block.  Remember the
+          # whole nesting stack up to this point.  This is what we
+          # keep after the #endif.
+          self.pp_stack[-1].seen_else = True
+          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
+
+        # Restore the stack to how it was before the #if
+        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
+      else:
+        # TODO(unknown): unexpected #else, issue warning?
+        pass
+    elif Match(r'^\s*#\s*endif\b', line):
+      # End of #if or #else blocks.
+      if self.pp_stack:
+        # If we saw an #else, we will need to restore the nesting
+        # stack to its former state before the #else, otherwise we
+        # will just continue from where we left off.
+        if self.pp_stack[-1].seen_else:
+          # Here we can just use a shallow copy since we are the last
+          # reference to it.
+          self.stack = self.pp_stack[-1].stack_before_else
+        # Drop the corresponding #if
+        self.pp_stack.pop()
+      else:
+        # TODO(unknown): unexpected #endif, issue warning?
+        pass
+
+  def Update(self, filename, clean_lines, linenum, error):
+    """Update nesting state with current line.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    line = clean_lines.elided[linenum]
+
+    # Update pp_stack first
+    self.UpdatePreprocessor(line)
+
+    # Count parentheses.  This is to avoid adding struct arguments to
+    # the nesting stack.
+    if self.stack:
+      inner_block = self.stack[-1]
+      depth_change = line.count('(') - line.count(')')
+      inner_block.open_parentheses += depth_change
+
+      # Also check if we are starting or ending an inline assembly block.
+      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+        if (depth_change != 0 and
+            inner_block.open_parentheses == 1 and
+            _MATCH_ASM.match(line)):
+          # Enter assembly block
+          inner_block.inline_asm = _INSIDE_ASM
+        else:
+          # Not entering assembly block.  If previous line was _END_ASM,
+          # we will now shift to _NO_ASM state.
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line.  Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace.  The "\b\s*" below catches namespace
+      # declarations even if it weren't followed by a whitespace, this
+      # is so that we don't confuse our namespace checker.  The
+      # missing spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces.  The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T
+    #             class Comparator = less<T>,
+    #             class Vector = vector<T> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename, typename> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        '(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        '(([^=>]|<[^<>]*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      access_match = Match(r'\s*(public|private|protected)\s*:', line)
+      if access_match:
+        self.stack[-1].access = access_match.group(1)
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+      # If namespace or class hasn't seen an opening brace yet, mark
+        # namespace/class head as complete.  Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration.  Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get class info on the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckClassFinished(self, filename, error):
     """Checks that all classes have been completely parsed.
 
     Call this when all lines in a file have been processed.
@@ -1306,17 +1733,18 @@
       filename: The name of the current file.
       error: The function to call with any errors found.
     """
-    if self.classinfo_stack:
-      # Note: This test can result in false positives if #ifdef constructs
-      # get in the way of brace matching. See the testBuildClass test in
-      # cpplint_unittest.py for an example of this.
-      error(filename, self.classinfo_stack[0].linenum, 'build/class', 5,
-            'Failed to find complete declaration of class %s' %
-            self.classinfo_stack[0].name)
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
 
 
 def CheckForNonStandardConstructs(filename, clean_lines, linenum,
-                                  class_state, error):
+                                  nesting_state, error):
   """Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
 
   Complain about several constructs which gcc-2 accepts, but which are
@@ -1329,8 +1757,6 @@
   - text after #endif is not allowed.
   - invalid inner-style forward declaration.
   - >? and <? operators, and their >?= and <?= cousins.
-  - classes with virtual methods need virtual destructors (compiler warning
-    available, but not turned on yet.)
 
   Additionally, check for constructor/destructor style violations and reference
   members, as it is very convenient to do so while checking for
@@ -1340,8 +1766,8 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    class_state: A _ClassState instance which maintains information about
-                 the current stack of nested class declarations being parsed.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
   """
@@ -1370,7 +1796,7 @@
   if Search(r'\b(const|volatile|void|char|short|int|long'
             r'|float|double|signed|unsigned'
             r'|schar|u?int8|u?int16|u?int32|u?int64)'
-            r'\s+(auto|register|static|extern|typedef)\b',
+            r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
           'Storage class (static, extern, typedef, etc) should be first.')
@@ -1400,45 +1826,13 @@
           'const string& members are dangerous. It is much better to use '
           'alternatives, such as pointers or simple constants.')
 
-  # Track class entry and exit, and attempt to find cases within the
-  # class declaration that don't meet the C++ style
-  # guidelines. Tracking is very dependent on the code matching Google
-  # style guidelines, but it seems to perform well enough in testing
-  # to be a worthwhile addition to the checks.
-  classinfo_stack = class_state.classinfo_stack
-  # Look for a class declaration. The regexp accounts for decorated classes
-  # such as in:
-  # class LOCKABLE API Object {
-  # };
-  class_decl_match = Match(
-      r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-      '(class|struct)\s+([A-Z_]+\s+)*(\w+(::\w+)*)', line)
-  if class_decl_match:
-    classinfo_stack.append(_ClassInfo(
-        class_decl_match.group(4), clean_lines, linenum))
-
-  # Everything else in this function uses the top of the stack if it's
-  # not empty.
-  if not classinfo_stack:
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
+  classinfo = nesting_state.InnermostClass()
+  if not classinfo or not classinfo.seen_open_brace:
     return
 
-  classinfo = classinfo_stack[-1]
-
-  # If the opening brace hasn't been seen look for it and also
-  # parent class declarations.
-  if not classinfo.seen_open_brace:
-    # If the line has a ';' in it, assume it's a forward declaration or
-    # a single-line class declaration, which we won't process.
-    if line.find(';') != -1:
-      classinfo_stack.pop()
-      return
-    classinfo.seen_open_brace = (line.find('{') != -1)
-    # Look for a bare ':'
-    if Search('(^|[^:]):($|[^:])', line):
-      classinfo.is_derived = True
-    if not classinfo.seen_open_brace:
-      return  # Everything else in this function is for after open brace
-
   # The class may have been declared with namespace or classname qualifiers.
   # The constructor and destructor will not have those qualifiers.
   base_classname = classinfo.name.split('::')[-1]
@@ -1455,36 +1849,7 @@
     error(filename, linenum, 'runtime/explicit', 5,
           'Single-argument constructors should be marked explicit.')
 
-  # Look for methods declared virtual.
-  if Search(r'\bvirtual\b', line):
-    classinfo.virtual_method_linenumber = linenum
-    # Only look for a destructor declaration on the same line. It would
-    # be extremely unlikely for the destructor declaration to occupy
-    # more than one line.
-    if Search(r'~%s\s*\(' % base_classname, line):
-      classinfo.has_virtual_destructor = True
 
-  # Look for class end.
-  brace_depth = classinfo.brace_depth
-  brace_depth = brace_depth + line.count('{') - line.count('}')
-  if brace_depth <= 0:
-    classinfo = classinfo_stack.pop()
-    # Try to detect missing virtual destructor declarations.
-    # For now, only warn if a non-derived class with virtual methods lacks
-    # a virtual destructor. This is to make it less likely that people will
-    # declare derived virtual destructors without declaring the base
-    # destructor virtual.
-    if ((classinfo.virtual_method_linenumber is not None) and
-        (not classinfo.has_virtual_destructor) and
-        (not classinfo.is_derived)):  # Only warn for base classes
-      error(filename, classinfo.linenum, 'runtime/virtual', 4,
-            'The class %s probably needs a virtual destructor due to '
-            'having virtual method(s), one declared at line %d.'
-            % (classinfo.name, classinfo.virtual_method_linenumber))
-  else:
-    classinfo.brace_depth = brace_depth
-
-
 def CheckSpacingForFunctionCall(filename, line, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
@@ -1535,7 +1900,8 @@
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall)):
+        not Search(r'#\s*define|typedef', fncall) and
+        not Search(r'\w\s+\((\w+::)?\*\w+\)\(', fncall)):
       error(filename, linenum, 'whitespace/parens', 4,
             'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
@@ -1668,8 +2034,165 @@
       error(filename, linenum, 'whitespace/todo', 2,
             'TODO(my_username) should be followed by a space')
 
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for improper use of DISALLOW* macros.
 
-def CheckSpacing(filename, clean_lines, linenum, error):
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+                   r'DISALLOW_EVIL_CONSTRUCTORS|'
+                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
+  if not matched:
+    return
+  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
+    if nesting_state.stack[-1].access != 'private':
+      error(filename, linenum, 'readability/constructors', 3,
+            '%s must be in the private: section' % matched.group(1))
+
+  else:
+    # Found DISALLOW* macro outside a class declaration, or perhaps it
+    # was used inside a function when it should have been part of the
+    # class declaration.  We could issue a warning here, but it
+    # probably resulted in a compiler error already.
+    pass
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+  """Find the corresponding > to close a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_suffix: Remainder of the current line after the initial <.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_suffix
+  nesting_stack = ['<']
+  while True:
+    # Find the next operator that can tell us whether < is used as an
+    # opening bracket or as a less-than operator.  We only want to
+    # warn on the latter case.
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator, but then we will get false
+    # positives for default arguments (e.g. http://go/prccd) and
+    # other template expressions (e.g. http://go/oxcjq).
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument.  We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or [].  If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument.  The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   """Checks for the correctness of various spacing issues in the code.
 
   Things we check for: spaces around operators, spaces after
@@ -1682,6 +2205,8 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
 
@@ -1691,7 +2216,16 @@
   # Before nixing comments, check if the line is blank for no good
   # reason.  This includes the first line after a block is opened, and
   # blank lines at the end of a function (ie, right before a line like '}'
-  if IsBlankLine(line):
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -1699,8 +2233,7 @@
     #                both start with alnums and are indented the same amount.
     #                This ignores whitespace at the start of a namespace block
     #                because those are not usually indented.
-    if (prevbrace != -1 and prev_line[prevbrace:].find('}') == -1
-        and prev_line[:prevbrace].find('namespace') == -1):
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
       # OK, we have a blank line at the start of a code block.  Before we
       # complain, we check if it is an exception to the rule: The previous
       # non-empty line has the parameters of a function header that are indented
@@ -1732,12 +2265,7 @@
       if not exception:
         error(filename, linenum, 'whitespace/blank_line', 2,
               'Blank line at the start of a code block.  Is this needed?')
-    # This doesn't ignore whitespace at the end of a namespace block
-    # because that is too hard without pairing open/close braces;
-    # however, a special exception is made for namespace closing
-    # brackets which have a comment containing "namespace".
-    #
-    # Also, ignore blank lines at the end of a block in a long if-else
+    # Ignore blank lines at the end of a block in a long if-else
     # chain, like this:
     #   if (condition1) {
     #     // Something followed by a blank line
@@ -1749,7 +2277,6 @@
       next_line = raw[linenum + 1]
       if (next_line
           and Match(r'\s*}', next_line)
-          and next_line.find('namespace') == -1
           and next_line.find('} else ') == -1):
         error(filename, linenum, 'whitespace/blank_line', 3,
               'Blank line at the end of a code block.  Is this needed?')
@@ -1810,26 +2337,59 @@
   # though, so we punt on this one for now.  TODO.
 
   # You should always have whitespace around binary operators.
-  # Alas, we can't test < or > because they're legitimately used sans spaces
-  # (a->b, vector<int> a).  The only time we can tell is a < with no >, and
-  # only if it's not template params list spilling into the next line.
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
   match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
-  if not match:
-    # Note that while it seems that the '<[^<]*' term in the following
-    # regexp could be simplified to '<.*', which would indeed match
-    # the same class of strings, the [^<] means that searching for the
-    # regexp takes linear rather than quadratic time.
-    if not Search(r'<[^<]*,\s*$', line):  # template params spill
-      match = Search(r'[^<>=!\s](<)[^<>=!\s]([^>]|->)*$', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << and >> when used like this: 10<<20, but
+  # We allow no-spaces around << when used like this: 10<<20, but
   # not otherwise (particularly, not when used as streams)
-  match = Search(r'[^0-9\s](<<|>>)[^0-9\s]', line)
+  match = Search(r'(\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if match and not (match.group(1).isdigit() and match.group(2).isdigit()):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around %s' % match.group(1))
+          'Missing spaces around >>')
 
   # There shouldn't be space around unary operators
   match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
@@ -1903,18 +2463,25 @@
   # the semicolon there.
   if Search(r':\s*;\s*$', line):
     error(filename, linenum, 'whitespace/semicolon', 5,
-          'Semicolon defining empty statement. Use { } instead.')
+          'Semicolon defining empty statement. Use {} instead.')
   elif Search(r'^\s*;\s*$', line):
     error(filename, linenum, 'whitespace/semicolon', 5,
           'Line contains only semicolon. If this should be an empty statement, '
-          'use { } instead.')
+          'use {} instead.')
   elif (Search(r'\s+;\s*$', line) and
         not Search(r'\bfor\b', line)):
     error(filename, linenum, 'whitespace/semicolon', 5,
           'Extra space before last semicolon. If this should be an empty '
-          'statement, use { } instead.')
+          'statement, use {} instead.')
 
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search('for *\(.*[^:]:[^: ]', line) or
+      Search('for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
 
+
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
   """Checks for additional blank line issues related to sections.
 
@@ -1938,8 +2505,8 @@
   #
   # If we didn't find the end of the class, last_line would be zero,
   # and the check will be skipped by the first condition.
-  if (class_info.last_line - class_info.linenum <= 24 or
-      linenum <= class_info.linenum):
+  if (class_info.last_line - class_info.starting_linenum <= 24 or
+      linenum <= class_info.starting_linenum):
     return
 
   matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
@@ -1950,15 +2517,18 @@
     #  - We are at the beginning of the class.
     #  - We are forward-declaring an inner class that is semantically
     #    private, but needed to be public for implementation reasons.
+    # Also ignores cases where the previous line ends with a backslash as can be
+    # common when defining classes in C macros.
     prev_line = clean_lines.lines[linenum - 1]
     if (not IsBlankLine(prev_line) and
-        not Search(r'\b(class|struct)\b', prev_line)):
+        not Search(r'\b(class|struct)\b', prev_line) and
+        not Search(r'\\$', prev_line)):
       # Try a bit harder to find the beginning of the class.  This is to
       # account for multi-line base-specifier lists, e.g.:
       #   class Derived
       #       : public Base {
-      end_class_head = class_info.linenum
-      for i in range(class_info.linenum, linenum):
+      end_class_head = class_info.starting_linenum
+      for i in range(class_info.starting_linenum, linenum):
         if Search(r'\{\s*$', clean_lines.lines[i]):
           end_class_head = i
           break
@@ -2008,9 +2578,11 @@
     # which is commonly used to control the lifetime of
     # stack-allocated variables.  We don't detect this perfectly: we
     # just don't complain if the last non-whitespace character on the
-    # previous non-blank line is ';', ':', '{', or '}'.
+    # previous non-blank line is ';', ':', '{', or '}', or if the previous
+    # line starts a preprocessor block.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-    if not Search(r'[;:}{]\s*$', prevline):
+    if (not Search(r'[;:}{]\s*$', prevline) and
+        not Match(r'\s*#', prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
@@ -2064,6 +2636,33 @@
           "You don't need a ; after a }")
 
 
+def CheckEmptyLoopBody(filename, clean_lines, linenum, error):
+  """Look for empty loop body with only a single semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Search for loop keywords at the beginning of the line.  Because only
+  # whitespaces are allowed before the keywords, this will also ignore most
+  # do-while-loops, since those lines should start with closing brace.
+  line = clean_lines.elided[linenum]
+  if Match(r'\s*(for|while)\s*\(', line):
+    # Find the end of the conditional expression
+    (end_line, end_linenum, end_pos) = CloseExpression(
+        clean_lines, linenum, line.find('('))
+
+    # Output warning if what follows the condition expression is a semicolon.
+    # No warning for all other cases, including whitespace or newline, since we
+    # have a separate check for semicolons preceded by whitespace.
+    if end_pos >= 0 and Match(r';', end_line[end_pos:]):
+      error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
+            'Empty loop bodies should use {} or continue')
+
+
 def ReplaceableCheck(operator, macro, line):
   """Determine whether a basic CHECK can be replaced with a more specific one.
 
@@ -2132,6 +2731,38 @@
       break
 
 
+def CheckAltTokens(filename, clean_lines, linenum, error):
+  """Check alternative keywords being used in boolean expressions.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Avoid preprocessor lines
+  if Match(r'^\s*#', line):
+    return
+
+  # Last ditch effort to avoid multi-line comments.  This will not help
+  # if the comment started before the current line or ended after the
+  # current line, but it catches most of the false positives.  At least,
+  # it provides a way to workaround this warning for people who use
+  # multi-line comments in preprocessor macros.
+  #
+  # TODO(unknown): remove this once cpplint has better support for
+  # multi-line comments.
+  if line.find('/*') >= 0 or line.find('*/') >= 0:
+    return
+
+  for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
+    error(filename, linenum, 'readability/alt_tokens', 2,
+          'Use operator %s instead of %s' % (
+              _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
+
+
 def GetLineWidth(line):
   """Determines the width of the line in column positions.
 
@@ -2154,7 +2785,7 @@
     return len(line)
 
 
-def CheckStyle(filename, clean_lines, linenum, file_extension, class_state,
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
                error):
   """Checks rules from the 'C++ style rules' section of cppguide.html.
 
@@ -2167,6 +2798,8 @@
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
 
@@ -2248,16 +2881,19 @@
       not ((cleansed_line.find('case ') != -1 or
             cleansed_line.find('default:') != -1) and
            cleansed_line.find('break;') != -1)):
-    error(filename, linenum, 'whitespace/newline', 4,
+    error(filename, linenum, 'whitespace/newline', 0,
           'More than one command on the same line')
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
-  CheckSpacing(filename, clean_lines, linenum, error)
+  CheckEmptyLoopBody(filename, clean_lines, linenum, error)
+  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
   CheckCheck(filename, clean_lines, linenum, error)
-  if class_state and class_state.classinfo_stack:
-    CheckSectionSpacing(filename, clean_lines,
-                        class_state.classinfo_stack[-1], linenum, error)
+  CheckAltTokens(filename, clean_lines, linenum, error)
+  classinfo = nesting_state.InnermostClass()
+  if classinfo:
+    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
 _RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
@@ -2554,9 +3190,11 @@
                      fnline))):
 
     # We allow non-const references in a few standard places, like functions
-    # called "swap()" or iostream operators like "<<" or ">>".
+    # called "swap()" or iostream operators like "<<" or ">>". We also filter
+    # out for loops, which lint otherwise mistakenly thinks are functions.
     if not Search(
-        r'(swap|Swap|operator[<>][<>])\s*\(\s*(?:[\w:]|<.*>)+\s*&',
+        r'(for|swap|Swap|operator[<>][<>])\s*\(\s*'
+        r'(?:(?:typename\s*)?[\w:]|<.*>)+\s*&',
         fnline):
       error(filename, linenum, 'runtime/references', 2,
             'Is this a non-const reference? '
@@ -2578,10 +3216,19 @@
     if (match.group(1) is None and  # If new operator, then this isn't a cast
         not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
              Match(r'^\s*MockCallback<.*>', line))):
-      error(filename, linenum, 'readability/casting', 4,
-            'Using deprecated casting style.  '
-            'Use static_cast<%s>(...) instead' %
-            match.group(2))
+      # Try a bit harder to catch gmock lines: the only place where
+      # something looks like an old-style cast is where we declare the
+      # return type of the mocked method, and the only time when we
+      # are missing context is if MOCK_METHOD was split across
+      # multiple lines (for example http://go/hrfhr ), so we only need
+      # to check the previous line for MOCK_METHOD.
+      if (linenum == 0 or
+          not Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(\S+,\s*$',
+                    clean_lines.elided[linenum - 1])):
+        error(filename, linenum, 'readability/casting', 4,
+              'Using deprecated casting style.  '
+              'Use static_cast<%s>(...) instead' %
+              match.group(2))
 
   CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
                   'static_cast',
@@ -2703,7 +3350,7 @@
   printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
   if printf_args:
     match = Match(r'([\w.\->()]+)$', printf_args)
-    if match:
+    if match and match.group(1) != '__VA_ARGS__':
       function_name = re.search(r'\b((?:string)?printf)\s*\(',
                                 line, re.I).group(1)
       error(filename, linenum, 'runtime/printf', 4,
@@ -2824,6 +3471,11 @@
           'Using sizeof(type).  Use sizeof(varname) instead if possible')
     return True
 
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
   remainder = line[match.end(0):]
 
   # The close paren is for function pointers as arguments to a function.
@@ -3112,13 +3764,13 @@
   if match:
     error(filename, linenum, 'build/explicit_make_pair',
           4,  # 4 = high confidence
-          'Omit template arguments from make_pair OR use pair directly OR'
-          ' if appropriate, construct a pair directly')
+          'For C++11-compatibility, omit template arguments from make_pair'
+          ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
-def ProcessLine(filename, file_extension,
-                clean_lines, line, include_state, function_state,
-                class_state, error, extra_check_functions=[]):
+def ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions=[]):
   """Processes a single line in the file.
 
   Args:
@@ -3129,8 +3781,8 @@
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    class_state: A _ClassState instance which maintains information about
-                 the current stack of nested class declarations being parsed.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
     extra_check_functions: An array of additional check functions that will be
@@ -3139,13 +3791,16 @@
   """
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
+  nesting_state.Update(filename, clean_lines, line, error)
+  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+    return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
-  CheckStyle(filename, clean_lines, line, file_extension, class_state, error)
+  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
   CheckLanguage(filename, clean_lines, line, file_extension, include_state,
                 error)
   CheckForNonStandardConstructs(filename, clean_lines, line,
-                                class_state, error)
+                                nesting_state, error)
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
@@ -3172,7 +3827,7 @@
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  class_state = _ClassState()
+  nesting_state = _NestingState()
 
   ResetNolintSuppressions()
 
@@ -3185,9 +3840,9 @@
   clean_lines = CleansedLines(lines)
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
-                include_state, function_state, class_state, error,
+                include_state, function_state, nesting_state, error,
                 extra_check_functions)
-  class_state.CheckFinished(filename, error)
+  nesting_state.CheckClassFinished(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
@@ -3301,7 +3956,8 @@
   try:
     (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
                                                  'counting=',
-                                                 'filter='])
+                                                 'filter=',
+                                                 'root='])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
@@ -3327,6 +3983,9 @@
       if val not in ('total', 'toplevel', 'detailed'):
         PrintUsage('Valid counting options are total, toplevel, and detailed')
       counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
 
   if not filenames:
     PrintUsage('No files were specified.')
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -97,7 +97,7 @@
     vmlal.s16       q11, d6, d17    ; c1*2217 + d1*5352 + 12000
     vmlsl.s16       q12, d6, d16    ; d1*2217 - c1*5352 + 51000
 
-    vmvn.s16        d4, d4
+    vmvn            d4, d4
     vshrn.s32       d1, q11, #16    ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
     vsub.s16        d1, d1, d4      ; op[4] += (d1!=0)
     vshrn.s32       d3, q12, #16    ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
@@ -200,7 +200,7 @@
     vmlal.s16       q11, d27, d17   ; B[4]  = c1*2217 + d1*5352 + 12000
     vmlsl.s16       q12, d27, d16   ; B[12] = d1*2217 - c1*5352 + 51000
 
-    vmvn.s16        q14, q14
+    vmvn            q14, q14
 
     vshrn.s32       d1, q9, #16     ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
     vshrn.s32       d3, q10, #16    ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2755,7 +2755,7 @@
     /* Clear the alternate reference update pending flag. */
     cpi->source_alt_ref_pending = 0;
 
-    /* Set the alternate refernce frame active flag */
+    /* Set the alternate reference frame active flag */
     cpi->source_alt_ref_active = 1;
 
 
@@ -3402,7 +3402,7 @@
     else
         cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
 
-    /* Check to see if a key frame is signalled
+    /* Check to see if a key frame is signaled
      * For two pass with auto key frame enabled cm->frame_type may already
      * be set, but not for one pass.
      */
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -91,18 +91,8 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 
-# TODO(johann) make this generic
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
-endif
-
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/denoising_sse2.c.d: CFLAGS += -msse2
-endif
 endif
 
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
--- a/vp9/common/ppc/vp9_copy_altivec.asm
+++ /dev/null
@@ -1,47 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0,  0, r3          ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1,  0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp9/common/ppc/vp9_filter_altivec.asm
+++ /dev/null
@@ -1,1013 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl sixtap_predict_ppc
-    .globl sixtap_predict8x4_ppc
-    .globl sixtap_predict8x8_ppc
-    .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
-    load_c \V0, HFilter, r5, r9, r10
-
-    addi    r5,  r5, 16
-    lvx     \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
-    load_c v0, VFilter, r6, r3, r10
-
-    vspltish v5, 8
-    vspltish v6, 3
-    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v1, v0, 1
-    vspltb  v2, v0, 2
-    vspltb  v3, v0, 3
-    vspltb  v4, v0, 4
-    vspltb  v5, v0, 5
-    vspltb  v0, v0, 0
-.endm
-
-.macro vpre_load
-    Vprolog
-    li      r10,  16
-    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
-    lvx     v11, r10, r9
-    addi    r9,   r9, 32
-    lvx     v12,   0, r9
-    lvx     v13, r10, r9
-    addi    r9,   r9, 32
-    lvx     v14,   0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
-                                ;# (Re,Ro) += (V*T)
-    vmuleub \TMP, \V, \T        ;# trashes v8
-    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
-    vmuloub \TMP, \V, \T
-    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
-    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
-    vadduhm v16, v6, v8
-    vmuloub  v8, \P0, v0
-    vadduhm v17, v6, v8
-    Msum v16, v17, \P2, v2, v8
-    Msum v16, v17, \P3, v3, v8
-    Msum v16, v17, \P5, v5, v8
-
-    vmuleub v18, \P1, v1        ;# 2 negative taps
-    vmuloub v19, \P1, v1
-    Msum v18, v19, \P4, v4, v8
-
-    vsubuhs v16, v16, v18       ;# subtract neg from pos
-    vsubuhs v17, v17, v19
-    vsrh    v16, v16, v7        ;# divide by 128
-    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
-    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
-    vmrglh  v19, v16, v17
-    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
-    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
-    vadduhm v21, v20, v24
-    vmuloub v24, \P0, v13
-    vadduhm v22, v20, v24
-    Msum v21, v22, \P2, v15, v25
-    Msum v21, v22, \P3, v16, v25
-    Msum v21, v22, \P5, v18, v25
-
-    vmuleub v23, \P1, v14       ;# 2 negative taps
-    vmuloub v24, \P1, v14
-    Msum v23, v24, \P4, v17, v25
-
-    vsubuhs v21, v21, v23       ;# subtract neg from pos
-    vsubuhs v22, v22, v24
-    vsrh    v21, v21, v19       ;# divide by 128
-    vsrh    v22, v22, v19       ;# v16 v17 = evens, odds
-    vmrghh  v23, v21, v22       ;# v18 v19 = 16-bit result in order
-    vmrglh  v24, v21, v22
-    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
-    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
-    stvx    \P0, 0, r7
-    add     r7, r7, r8      ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
-    addi    r9,   r9, 16        ;# P5 = newest input row
-    lvx     \P5,   0, r9
-    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
-    luma_v v10, v11, v12, v13, v14, v15
-    luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
-    luma_vtwo
-    luma_v v12, v13, v14, v15, v10, v11
-    luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
-    luma_vfour
-    luma_v v14, v15, v10, v11, v12, v13
-    luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
-    vmsummbm \R, v13, \I, v15
-    vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     \VD,   0, \RS
-    lvx     v20, r10, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
-    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
-    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
-    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
-    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
-    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7
-
-    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
-    vsrh    \R, \R, v19
-
-    vpkuhus \R, \R, \R          ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl    v21,  0, \RS        ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v20,   0, \RS
-
-.if \increment_counter
-    add     \RS, \RS, \RP
-.endif
-
-    vperm   \VD, v20, v20, v21
-.endm
-    .text
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xff87
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    vertical_only_4x4
-
-    ;# load up horizontal filter
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_4x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_4x4
-
-vertical_only_4x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0, r3, r4, 1
-    Read8x8 v1, r3, r4, 1
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_4x4:
-    load_c   v20, b_hilo_4x4, 0, r9, r10
-    load_c   v21, b_hilo, 0, r9, r10
-
-    ;# reposition input so that it can go through the
-    ;# filtering phase with one pass.
-    vperm   v0, v0, v1, v20     ;# 0 1 x x
-    vperm   v2, v2, v3, v20     ;# 2 3 x x
-    vperm   v4, v4, v5, v20     ;# 4 5 x x
-    vperm   v6, v6, v7, v20     ;# 6 7 x x
-
-    vperm   v0, v0, v2, v21     ;# 0 1 2 3
-    vperm   v4, v4, v6, v21     ;# 4 5 6 7
-
-    vsldoi  v1, v0, v4, 4
-    vsldoi  v2, v0, v4, 8
-    vsldoi  v3, v0, v4, 12
-
-    vsldoi  v5, v4, v8, 4
-
-    load_c   v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
-    stvx    v0, 0, r1
-
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 4(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 8(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    lwz     r0, 12(r1)
-    stw     r0, 0(r7)
-
-    b       exit_4x4
-
-store_4x4:
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v4, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v5, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x4
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x4
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0, r9, r4, 1
-    Read8x8 v1, r9, r4, 0
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-
-    b       second_pass_8x4
-
-second_pass_pre_copy_8x4:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x4:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-    b       exit_8x4
-
-store_8x4:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x4
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned2_8x4:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;#  register there is no need to loop.  Everything can stay in registers.
-sixtap_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq-    second_pass_pre_copy_8x8
-
-    load_hfilter v13, v14
-
-    ;# rounding added in on the multiply
-    vspltisw v16, 8
-    vspltisw v15, 3
-    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040
-
-    ;# Load up permutation constants
-    load_c v16, B_0123, 0, r9, r10
-    load_c v17, B_4567, 0, r9, r10
-    load_c v18, B_89AB, 0, r9, r10
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    addi    r9, r3, 0
-    li      r10, 16
-    Read8x8 v2, r3, r4, 1
-    Read8x8 v3, r3, r4, 1
-    Read8x8 v4, r3, r4, 1
-    Read8x8 v5, r3, r4, 1
-    Read8x8 v6, r3, r4, 1
-    Read8x8 v7, r3, r4, 1
-    Read8x8 v8, r3, r4, 1
-    Read8x8 v9, r3, r4, 1
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# filter a line
-    interp_8x8 v2
-    interp_8x8 v3
-    interp_8x8 v4
-    interp_8x8 v5
-    interp_8x8 v6
-    interp_8x8 v7
-    interp_8x8 v8
-    interp_8x8 v9
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional 5 lines that are needed
-    ;#  for the vertical filter.
-    beq-    store_8x8
-
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r9, r9, r4
-    sub     r9, r9, r4
-
-    Read8x8 v0,  r9, r4, 1
-    Read8x8 v1,  r9, r4, 0
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    interp_8x8 v0
-    interp_8x8 v1
-    interp_8x8 v10
-    interp_8x8 v11
-    interp_8x8 v12
-
-    b       second_pass_8x8
-
-second_pass_pre_copy_8x8:
-    ;# only needed if there is a vertical filter present
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-    li      r10, 16
-
-    Read8x8 v0,  r3, r4, 1
-    Read8x8 v1,  r3, r4, 1
-    Read8x8 v2,  r3, r4, 1
-    Read8x8 v3,  r3, r4, 1
-    Read8x8 v4,  r3, r4, 1
-    Read8x8 v5,  r3, r4, 1
-    Read8x8 v6,  r3, r4, 1
-    Read8x8 v7,  r3, r4, 1
-    Read8x8 v8,  r3, r4, 1
-    Read8x8 v9,  r3, r4, 1
-    Read8x8 v10, r3, r4, 1
-    Read8x8 v11, r3, r4, 1
-    Read8x8 v12, r3, r4, 0
-
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-second_pass_8x8:
-    load_c v13, VFilter, r6, r9, r10
-
-    vspltish v15, 8
-    vspltish v20, 3
-    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    vspltb  v14, v13, 1
-    vspltb  v15, v13, 2
-    vspltb  v16, v13, 3
-    vspltb  v17, v13, 4
-    vspltb  v18, v13, 5
-    vspltb  v13, v13, 0
-
-    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
-    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
-    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
-    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
-    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
-    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
-    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
-    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8:
-
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-    b       exit_8x8
-
-store_8x8:
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned2_8x8
-
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-    w_8x8   v8, r7, r0, r8
-    w_8x8   v9, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned2_8x8:
-    load_c v10, b_hilo, 0, r9, r10
-
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-    vperm   v8, v8, v9, v10
-
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-    addi    r7, r7, 16
-    stvx    v8, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
-;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
-;#  temporary buffer because the source buffer can't be modified and the buffer
-;#  for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    ;# Three possiblities
-    ;#  1. First filter is null.  Don't use a temp buffer.
-    ;#  2. Second filter is null.  Don't use a temp buffer.
-    ;#  3. Neither are null, use temp buffer.
-
-    ;# First Pass (horizontal edge)
-    ;#  setup pointers for src
-    ;#  if possiblity (1) then setup the src pointer to be the orginal and jump
-    ;#  to second pass.  this is based on if x_offset is 0.
-
-    ;# load up horizontal filter
-    slwi.   r5, r5, 5           ;# index into horizontal filter array
-
-    load_hfilter v4, v5
-
-    beq-    copy_horizontal_16x21
-
-    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
-    addi    r3, r3, -2
-
-    slwi.   r6, r6, 4           ;# index into vertical filter array
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v14, b_hperm, 0, r9, r10
-
-    ;# These statements are guessing that there won't be a second pass,
-    ;#  but if there is then inside the bypass they need to be set
-    li      r0, 16              ;# prepare for no vertical filter
-
-    ;# Change the output pointer and pitch to be the actual
-    ;#  desination instead of a temporary buffer.
-    addi    r9, r7, 0
-    addi    r5, r8, 0
-
-    ;# no vertical filter, so write the output from the first pass
-    ;#  directly into the output buffer.
-    beq-    no_vertical_filter_bypass
-
-    ;# if the second filter is not null then need to back off by 2*pitch
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# setup counter for the number of lines that are going to be filtered
-    li      r0, 21
-
-    ;# use the stack as temporary storage
-    la      r9, 48(r1)
-    li      r5, 16
-
-no_vertical_filter_bypass:
-
-    mtctr   r0
-
-    ;# rounding added in on the multiply
-    vspltisw v10, 8
-    vspltisw v12, 3
-    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v13, 7
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-    lvx     v3, r12, r3
-
-    vperm   v8, v1, v2, v15
-    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified
-
-    vsldoi  v11, v8, v9, 4
-
-    ;# set 0
-    vmsummbm v6, v4, v8, v12    ;# taps times elements
-    vmsummbm v0, v5, v11, v6
-
-    ;# set 1
-    vsldoi  v10, v8, v9, 1
-    vsldoi  v11, v8, v9, 5
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v1, v5, v11, v6
-
-    ;# set 2
-    vsldoi  v10, v8, v9, 2
-    vsldoi  v11, v8, v9, 6
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v2, v5, v11, v6
-
-    ;# set 3
-    vsldoi  v10, v8, v9, 3
-    vsldoi  v11, v8, v9, 7
-
-    vmsummbm v6, v4, v10, v12
-    vmsummbm v3, v5, v11, v6
-
-    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F
-
-    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
-    vsrh    v1, v1, v13
-
-    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
-    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result
-
-    stvx    v0,  0, r9
-    add     r9, r9, r5
-
-    add     r3, r3, r4
-
-    bdnz    horizontal_loop_16x16
-
-    ;# check again to see if vertical filter needs to be done.
-    cmpi    cr0, r6, 0
-    beq     cr0, end_16x16
-
-    ;# yes there is, so go to the second pass
-    b       second_pass_16x16
-
-copy_horizontal_16x21:
-    li      r10, 21
-    mtctr   r10
-
-    li      r10, 16
-
-    sub     r3, r3, r4
-    sub     r3, r3, r4
-
-    ;# this is done above if there is a horizontal filter,
-    ;#  if not it needs to be done down here.
-    slwi    r6, r6, 4           ;# index into vertical filter array
-
-    ;# always write to the stack when doing a horizontal copy
-    la      r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl    v15,  0, r3         ;# permutate value for alignment
-
-    lvx     v1,   0, r3
-    lvx     v2, r10, r3
-
-    vperm   v8, v1, v2, v15
-
-    stvx    v8,  0, r9
-    addi    r9, r9, 16
-
-    add     r3, r3, r4
-
-    bdnz    copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
-    ;# always read from the stack when doing a vertical filter
-    la      r9, 48(r1)
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v7, 7
-
-    vpre_load
-
-    luma_vsix
-    luma_vsix
-    luma_vfour
-
-end_16x16:
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-HFilter:
-    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
-    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
-    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
-    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
-    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
-    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
-    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
-    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
-    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
-    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
-    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
-    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
-
-    .align 4
-VFilter:
-    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-
-    .align 4
-b_hperm:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-B_0123:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-B_4567:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-    .align 4
-B_89AB:
-    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
-    .align 4
-b_hilo:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-b_hilo_4x4:
-    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
--- a/vp9/common/ppc/vp9_filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl bilinear_predict4x4_ppc
-    .globl bilinear_predict8x4_ppc
-    .globl bilinear_predict8x8_ppc
-    .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r9, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-    li      r12, 32
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r9, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r9, r0
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro HFilter V
-    vperm   v24, v21, v21, v10  ;# v20 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, v11  ;# v21 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v16 v17 = evens, odds
-    vmrghh  \P0, v22, v23       ;# v18 v19 = 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
-    stvx    \V, 0, r1
-    lwz     \R, 0(r1)
-    stw     \R, 0(r7)
-    lwz     \R, 4(r1)
-    stw     \R, 4(r7)
-    add     \D, \D, \P
-.endm
-
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_4x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_4x4_b:
-
-    stvx    v0, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v1, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v2, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-    add     r7, r7, r8
-
-    stvx    v3, 0, r1
-    lwz     r0, 0(r1)
-    stw     r0, 0(r7)
-
-exit_4x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x4_b
-
-    hfilter_8 v4, 0
-
-    b   second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-
-second_pass_8x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-store_out_8x4_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x4_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-
-    b       exit_8x4
-
-store_aligned_8x4_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-
-exit_8x4:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r9, r12
-    load_c v11, b_4567_b, 0, r9, r12
-
-    hfilter_8 v0, 1
-    hfilter_8 v1, 1
-    hfilter_8 v2, 1
-    hfilter_8 v3, 1
-    hfilter_8 v4, 1
-    hfilter_8 v5, 1
-    hfilter_8 v6, 1
-    hfilter_8 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_8x8_b
-
-    hfilter_8 v8, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_8  v0, 1
-    load_and_align_8  v1, 1
-    load_and_align_8  v2, 1
-    load_and_align_8  v3, 1
-    load_and_align_8  v4, 1
-    load_and_align_8  v5, 1
-    load_and_align_8  v6, 1
-    load_and_align_8  v7, 1
-    load_and_align_8  v8, 0
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-store_out_8x8_b:
-
-    cmpi    cr0, r8, 8
-    beq     cr0, store_aligned_8x8_b
-
-    w_8x8   v0, r7, r0, r8
-    w_8x8   v1, r7, r0, r8
-    w_8x8   v2, r7, r0, r8
-    w_8x8   v3, r7, r0, r8
-    w_8x8   v4, r7, r0, r8
-    w_8x8   v5, r7, r0, r8
-    w_8x8   v6, r7, r0, r8
-    w_8x8   v7, r7, r0, r8
-
-    b       exit_8x8
-
-store_aligned_8x8_b:
-    load_c v10, b_hilo_b, 0, r9, r10
-
-    vperm   v0, v0, v1, v10
-    vperm   v2, v2, v3, v10
-    vperm   v4, v4, v5, v10
-    vperm   v6, v6, v7, v10
-
-    stvx    v0, 0, r7
-    addi    r7, r7, 16
-    stvx    v2, 0, r7
-    addi    r7, r7, 16
-    stvx    v4, 0, r7
-    addi    r7, r7, 16
-    stvx    v6, 0, r7
-
-exit_8x8:
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v8 v9 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
-    stvx    \V,  0, r7
-
-.if \increment_counter
-    add     r7, r7, r8
-.endif
-.endm
-
-    .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     store_out_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  1
-    load_and_align_16  v1,  1
-    load_and_align_16  v2,  1
-    load_and_align_16  v3,  1
-    load_and_align_16  v4,  1
-    load_and_align_16  v5,  1
-    load_and_align_16  v6,  1
-    load_and_align_16  v7,  1
-    load_and_align_16  v8,  1
-    load_and_align_16  v9,  1
-    load_and_align_16  v10, 1
-    load_and_align_16  v11, 1
-    load_and_align_16  v12, 1
-    load_and_align_16  v13, 1
-    load_and_align_16  v14, 1
-    load_and_align_16  v15, 1
-    load_and_align_16  v16, 0
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-store_out_16x16_b:
-
-    write_16 v0,  1
-    write_16 v1,  1
-    write_16 v2,  1
-    write_16 v3,  1
-    write_16 v4,  1
-    write_16 v5,  1
-    write_16 v6,  1
-    write_16 v7,  1
-    write_16 v8,  1
-    write_16 v9,  1
-    write_16 v10, 1
-    write_16 v11, 1
-    write_16 v12, 1
-    write_16 v13, 1
-    write_16 v14, 1
-    write_16 v15, 0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp9/common/ppc/vp9_idct_altivec.asm
+++ /dev/null
@@ -1,189 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl short_idct4x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-    .align 2
-short_idct4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    load_c v8, sinpi8sqrt2, 0, r9, r10
-    load_c v9, cospi8sqrt2minus1, 0, r9, r10
-    load_c v10, hi_hi, 0, r9, r10
-    load_c v11, lo_lo, 0, r9, r10
-    load_c v12, shift_16, 0, r9, r10
-
-    li      r10,  16
-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]
-    lvx     v1, r10, r3         ;# input ip[8], ip[12]
-
-    ;# first pass
-    vupkhsh v2, v0
-    vupkhsh v3, v1
-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]
-
-    vupklsh v0, v0
-    vmulosh v4, v0, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vupklsh v1, v1
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v3, v1, v8
-    vsraw   v3, v3, v12
-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v0, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v0
-
-    vaddsws v3, v3, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    ;# transpose input
-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1
-
-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3
-
-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1
-
-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3
-
-    ;# second pass
-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]
-
-    vmulosh v4, v1, v8
-    vsraw   v4, v4, v12
-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v3, v9
-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v3
-
-    vsubsws v4, v4, v5          ;# c1
-
-    vmulosh v2, v3, v8
-    vsraw   v2, v2, v12
-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)
-
-    vmulosh v5, v1, v9
-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
-    vaddsws v5, v5, v1
-
-    vaddsws v3, v2, v5          ;# d1
-
-    vaddsws v0, v6, v3          ;# a1 + d1
-    vsubsws v3, v6, v3          ;# a1 - d1
-
-    vaddsws v1, v7, v4          ;# b1 + c1
-    vsubsws v2, v7, v4          ;# b1 - c1
-
-    vspltish v6, 4
-    vspltish v7, 3
-
-    vpkswss v0, v0, v1
-    vpkswss v1, v2, v3
-
-    vaddshs v0, v0, v6
-    vaddshs v1, v1, v6
-
-    vsrah   v0, v0, v7
-    vsrah   v1, v1, v7
-
-    ;# transpose output
-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3
-
-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3
-
-    stwu    r1,-416(r1)         ;# create space on the stack
-
-    stvx    v0,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    add     r4, r4, r5
-
-    stvx    v1,  0, r1
-    lwz     r6, 0(r1)
-    stw     r6, 0(r4)
-    lwz     r6, 4(r1)
-    stw     r6, 4(r4)
-
-    add     r4, r4, r5
-
-    lwz     r6,  8(r1)
-    stw     r6,  0(r4)
-    lwz     r6, 12(r1)
-    stw     r6,  4(r4)
-
-    addi    r1, r1, 416         ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 4
-sinpi8sqrt2:
-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-    .align 4
-cospi8sqrt2minus1:
-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-    .align 4
-shift_16:
-    .long      16,    16,    16,    16
-
-    .align 4
-hi_hi:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
-
-    .align 4
-lo_lo:
-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
--- a/vp9/common/ppc/vp9_loopfilter_altivec.c
+++ /dev/null
@@ -1,127 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
-  unsigned char *u,   // source pointer
-  unsigned char *v,   // source pointer
-  int p,              // pitch
-  const signed char *flimit,
-  const signed char *limit,
-  const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
-  unsigned char *s,   // source pointer
-  int p,              // pitch
-  const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  // These should all be done at once with one call, instead of 3
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-  loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
-  loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi) {
-  loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
-  if (u_ptr)
-    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi) {
-  (void)u_ptr;
-  (void)v_ptr;
-  (void)uv_stride;
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 4,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 8,  y_stride, lfi->flim);
-  loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
--- a/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl mbloop_filter_horizontal_edge_y_ppc
-    .globl loop_filter_horizontal_edge_y_ppc
-    .globl mbloop_filter_vertical_edge_y_ppc
-    .globl loop_filter_vertical_edge_y_ppc
-
-    .globl mbloop_filter_horizontal_edge_uv_ppc
-    .globl loop_filter_horizontal_edge_uv_ppc
-    .globl mbloop_filter_vertical_edge_uv_ppc
-    .globl loop_filter_vertical_edge_uv_ppc
-
-    .globl loop_filter_simple_horizontal_edge_ppc
-    .globl loop_filter_simple_vertical_edge_ppc
-
-    .text
-;# We often need to perform transposes (and other transpose-like operations)
-;#   on matrices of data.  This is simplified by the fact that we usually
-;#   operate on hunks of data whose dimensions are powers of 2, or at least
-;#   divisible by highish powers of 2.
-;#
-;#   These operations can be very confusing.  They become more straightforward
-;#   when we think of them as permutations of address bits: Concatenate a
-;#   group of vector registers and think of it as occupying a block of
-;#   memory beginning at address zero.  The low four bits 0...3 of the
-;#   address then correspond to position within a register, the higher-order
-;#   address bits select the register.
-;#
-;#   Although register selection, at the code level, is arbitrary, things
-;#   are simpler if we use contiguous ranges of register numbers, simpler
-;#   still if the low-order bits of the register number correspond to
-;#   conceptual address bits.  We do this whenever reasonable.
-;#
-;#   A 16x16 transpose can then be thought of as an operation on
-;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;#   memory and the effect of a transpose is to interchange address bit
-;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;#   column, which is interchanged with the row addressed by bits 4..7.
-;#
-;#   The altivec merge instructions provide a rapid means of effecting
-;#   many of these transforms.  They operate at three widths (8,16,32).
-;#   Writing V(x) for vector register #x, paired merges permute address
-;#   indices as follows.
-;#
-;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#      vmrghb  V( x),          V( y), V( y + (1<<s))
-;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#      vmrghh  V( x),          V( y), V( y + (1<<s))
-;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
-;#
-;#      vmrghw  V( x),          V( y), V( y + (1<<s))
-;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;#   Unfortunately, there is no doubleword merge instruction.
-;#   The following sequence uses "vperm" is a substitute.
-;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;#   are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
-;#
-;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
-;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;#   Except for bits s and d, the other relationships between register
-;#   number (= high-order part of address) bits are at the disposal of
-;#   the programmer.
-;#
-
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;#   edges together.  This requires a single 16x16 transpose, which, in
-;#   the above language, amounts to the following permutation of address
-;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
-;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;#   Except for the fact that the destination registers get written
-;#   before we are done referencing the old contents, the cyclic transform
-;#   is effected by
-;#
-;#      x = 0;  do {
-;#          vmrghb V(2x),   V(x), V(x+8);
-;#          vmrghb V(2x+1), V(x), V(x+8);
-;#      } while( ++x < 8);
-;#
-;#   For clarity, and because we can afford it, we do this transpose
-;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
-;#   leaving the final result in 16 .. 31, as the lower registers are
-;#   used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1, v16,v24
-    Tpair v2,v3, v17,v25
-    Tpair v4,v5, v18,v26
-    Tpair v6,v7, v19,v27
-    Tpair v8,v9, v20,v28
-    Tpair v10,v11, v21,v29
-    Tpair v12,v13, v22,v30
-    Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#  v0 =  0  1 ... 14 15
-;#  v1 = 16 17 ... 30 31
-;#  v2 = 32 33 ... 47 48
-;#  v3 = 49 50 ... 62 63
-;#
-;#  In frame-buffer memory, the layout is:
-;#
-;#     0  16  32  48
-;#     1  17  33  49
-;#     ...
-;#    15  31  47  63.
-;#
-;#  We begin by reading the data 32 bits at a time (using scalar operations)
-;#  into a temporary array, reading the rows of the array into vector registers,
-;#  with the following layout:
-;#
-;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
-;#  v1 =  1 17 33 49  5 21 ...                      45 61
-;#  v2 =  2 18 ...                                  46 62
-;#  v3 =  3 19 ...                                  47 63
-;#
-;#  From the "address-bit" perspective discussed above, we simply need to
-;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;#  In other words, we transpose each of the four 4x4 submatrices.
-;#
-;#  This transformation is its own inverse, and we need to perform it
-;#  again before writing the pixels back into the frame buffer.
-;#
-;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;#  and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;#  defined above.  We think of both groups of 4 registers as having
-;#  "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
-    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=
-
-    vmrghb  v4, v0, v1
-    vmrglb  v5, v0, v1
-    vmrghb  v6, v2, v3
-    vmrglb  v7, v2, v3
-
-    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1
-
-    vmrghh  v0, v4, v6
-    vmrglh  v1, v4, v6
-    vmrghh  v2, v5, v7
-    vmrglh  v3, v5, v7
-
-    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=
-
-    vmrghw  v4, v0, v1
-    vmrglw  v5, v0, v1
-    vmrghw  v6, v2, v3
-    vmrglw  v7, v2, v3
-
-    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3
-
-    vperm   v0, v4, v6, \Vlo
-    vperm   v1, v4, v6, \Vhi
-    vperm   v2, v5, v7, \Vlo
-    vperm   v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;#   We read 8 columns of data, initially in the following pattern:
-;#
-;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
-;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
-;#  ...
-;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;#   and wish to convert to:
-;#
-;#  (0,0) ... (0,15)
-;#  (1,0) ... (1,15)
-;#  ...
-;#  (7,0) ... (7,15).
-;#
-;#  In "address bit" language, we wish to map
-;#
-;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
-;#
-;#  This can be accomplished by 4 iterations of the cyclic transform
-;#
-;#  I -> (I+1) mod 7;
-;#
-;#  each iteration can be realized by (d=0, s=2):
-;#
-;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
-;#
-;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
-;#  preserving v8 = sign converter.
-;#
-;#  Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;#  result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
-    Tpair v10, v11,  v0, v4
-    Tpair v12, v13,  v1, v5
-    Tpair v14, v15,  v2, v6
-    Tpair v16, v17,  v3, v7
-.endm
-
-.macro t8x16_even
-    Tpair v0, v1,  v10, v14
-    Tpair v2, v3,  v11, v15
-    Tpair v4, v5,  v12, v16
-    Tpair v6, v7,  v13, v17
-.endm
-
-.macro transpose8x16_fwd
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-    t8x16_even
-.endm
-
-.macro transpose8x16_inv
-    t8x16_odd
-    t8x16_even
-    t8x16_odd
-.endm
-
-.macro Transpose16x16
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-    vmrghb  v0, v16, v24
-    vmrglb  v1, v16, v24
-    vmrghb  v2, v17, v25
-    vmrglb  v3, v17, v25
-    vmrghb  v4, v18, v26
-    vmrglb  v5, v18, v26
-    vmrghb  v6, v19, v27
-    vmrglb  v7, v19, v27
-    vmrghb  v8, v20, v28
-    vmrglb  v9, v20, v28
-    vmrghb  v10, v21, v29
-    vmrglb  v11, v21, v29
-    vmrghb  v12, v22, v30
-    vmrglb  v13, v22, v30
-    vmrghb  v14, v23, v31
-    vmrglb  v15, v23, v31
-    vmrghb  v16, v0, v8
-    vmrglb  v17, v0, v8
-    vmrghb  v18, v1, v9
-    vmrglb  v19, v1, v9
-    vmrghb  v20, v2, v10
-    vmrglb  v21, v2, v10
-    vmrghb  v22, v3, v11
-    vmrglb  v23, v3, v11
-    vmrghb  v24, v4, v12
-    vmrglb  v25, v4, v12
-    vmrghb  v26, v5, v13
-    vmrglb  v27, v5, v13
-    vmrghb  v28, v6, v14
-    vmrglb  v29, v6, v14
-    vmrghb  v30, v7, v15
-    vmrglb  v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;#   into vector register Vreg.  Trashes r0
-.macro load_g Vreg, Gptr
-    lwz     r0, \Gptr
-    lvx     \Vreg, 0, r0
-.endm
-
-;# exploit the saturation here.  if the answer is negative
-;# it will be clamped to 0.  orring 0 with a positive
-;# number will be the positive number (abs)
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
-    vsububs \RES, \A, \B
-    vsububs \TMP, \B, \A
-    vor     \RES, \RES, \TMP
-.endm
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
-    vsububs \TMP, \A, \B
-    vmaxub  \RES, \RES, \TMP
-    vsububs \TMP, \B, \A
-    vmaxub  \RES, \RES, \TMP
-.endm
-
-.macro Masks
-    ;# build masks
-    ;# input is all 8 bit unsigned (0-255).  need to
-    ;# do abs(vala-valb) > limit.  but no need to compare each
-    ;# value to the limit.  find the max of the absolute differences
-    ;# and compare that to the limit.
-    ;# First hev
-    Abs     v14, v13, v2, v3    ;# |P1 - P0|
-    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|
-
-    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded
-
-    ;# Next limit
-    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
-    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
-    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
-    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|
-
-    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded
-
-    ;# flimit
-    Abs     v14, v13, v3, v4    ;# |P0 - Q0|
-
-    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded
-
-    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
-    ;# done building masks
-.endm
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
-    ;# build constants
-    lvx     \FL, 0, \RFL        ;# flimit
-    lvx     \LI, 0, \RLI        ;# limit
-    lvx     \TH, 0, \RTH        ;# thresh
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
-    ;# setup strides/pointers to be able to access
-    ;# all of the data
-    add     r5, r4, r4          ;# r5 = 2 * stride
-    sub     r6, r3, r5          ;# r6 -> 2 rows back
-    neg     r7, r4              ;# r7 = -stride
-
-    ;# load 16 pixels worth of data to work on
-    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
-    lvx     v0,  0, r0          ;# P3  (read only)
-    lvx     v1, r7, r6          ;# P2
-    lvx     v2,  0, r6          ;# P1
-    lvx     v3, r7, r3          ;# P0
-    lvx     v4,  0, r3          ;# Q0
-    lvx     v5, r4, r3          ;# Q1
-    lvx     v6, r5, r3          ;# Q2
-    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
-    lvx     v7, r4, r0          ;# Q3  (read only)
-.endm
-
-;# Expects
-;#  v10 == HEV
-;#  v13 == tmp
-;#  v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
-    vxor    \P1, \P1, v11       ;# SP1
-    vxor    \P0, \P0, v11       ;# SP0
-    vxor    \Q0, \Q0, v11       ;# SQ0
-    vxor    \Q1, \Q1, v11       ;# SQ1
-
-    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
-.if \HEV_PRESENT
-    vand    v13, v13, v10       ;# f &= hev
-.endif
-    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
-    vandc   v13, v13, v8        ;# f &= mask
-
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v13, v8        ;# f2 = c (f+3)
-
-    vsrab   v13, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
-    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
-.endm
-
-.macro vp8_mbfilter
-    Masks
-
-    ;# start the fitering here
-    vxor    v1, v1, v11         ;# SP2
-    vxor    v2, v2, v11         ;# SP1
-    vxor    v3, v3, v11         ;# SP0
-    vxor    v4, v4, v11         ;# SQ0
-    vxor    v5, v5, v11         ;# SQ1
-    vxor    v6, v6, v11         ;# SQ2
-
-    ;# add outer taps if we have high edge variance
-    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)
-
-    vsubsbs v14, v4, v3         ;# SQ0-SP0
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
-    vandc   v13, v13, v8        ;# f &= mask
-    vand    v15, v13, v10       ;# f2 = f & hev
-
-    ;# save bottom 3 bits so that we round one side +4 and the other +3
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
-    vaddsbs v15, v15, v8        ;# f2 = c (f+3)
-
-    vsrab   v14, v14, v8        ;# f1 >>= 3
-    vsrab   v15, v15, v8        ;# f2 >>= 3
-
-    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
-    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)
-
-    ;# only apply wider filter if not high edge variance
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vspltisb v9, 2
-    vnor    v8, v8, v8
-    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
-    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
-    vspltisb v8, 9
-
-    ;# roughly 1/7th difference across boundary
-    vspltish v10, 7
-    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v8, v13
-    vaddshs v14, v14, v9        ;# +=  63
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
-    vaddsbs v1, v1, v10
-
-    vxor    v6, v6, v11
-    vxor    v1, v1, v11
-
-    ;# roughly 2/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v12, v13
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
-    vaddsbs v2, v2, v10
-
-    vxor    v5, v5, v11
-    vxor    v2, v2, v11
-
-    ;# roughly 3/7th difference across boundary
-    vspltish v10, 7
-    vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v12, v13
-    vaddshs v14, v14, v9
-    vaddshs v15, v15, v9
-    vsrah   v14, v14, v10       ;# >>= 7
-    vsrah   v15, v15, v10
-    vmrglh  v10, v15, v14
-    vmrghh  v15, v15, v14
-
-    vpkshss v10, v15, v10       ;# X = saturated down to bytes
-
-    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
-    vaddsbs v3, v3, v10
-
-    vxor    v4, v4, v11
-    vxor    v3, v3, v11
-.endm
-
-.macro SBFilter
-    Masks
-
-    common_adjust v3, v4, v2, v5, 1
-
-    ;# outer tap adjustments
-    vspltisb v8, 1
-
-    vaddubm v13, v13, v8        ;# f  += 1
-    vsrab   v13, v13, v8        ;# f >>= 1
-
-    vandc   v13, v13, v10       ;# f &= ~hev
-
-    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
-    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)
-
-    vxor    v2, v2, v11
-    vxor    v3, v3, v11
-    vxor    v4, v4, v11
-    vxor    v5, v5, v11
-.endm
-
-    .align 2
-mbloop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    vp8_mbfilter
-
-    stvx     v1, r7, r6         ;# P2
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-    stvx     v6, r5, r3         ;# Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    load_data_y
-
-    SBFilter
-
-    stvx     v2,  0, r6         ;# P1
-    stvx     v3, r7, r3         ;# P0
-    stvx     v4,  0, r3         ;# Q0
-    stvx     v5, r4, r3         ;# Q1
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary.
-;#  So we can read in an entire mb aligned.  However if we want to filter the mb
-;#  edge we run into problems.  For the loopfilter we require 4 bytes before the mb
-;#  and 4 after for a total of 8 bytes.  Reading 16 bytes inorder to get 4 is a bit
-;#  of a waste.  So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;#  memory to align and order them up.  Then they are read in using the
-;#  vector register file.
-.macro RLVmb V, R
-    lwzux   r0, r3, r4
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r3, r4
-    stw     r0,12(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-.macro WLVmb V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 8(\R)
-    stw     r0,-4(r3)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r4
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-    sub     r3, r3, r4
-
-    RLVmb v0, r9
-    RLVmb v1, r9
-    RLVmb v2, r9
-    RLVmb v3, r9
-    RLVmb v4, r9
-    RLVmb v5, r9
-    RLVmb v6, r9
-    RLVmb v7, r9
-
-    transpose8x16_fwd
-
-    build_constants r5, r6, r7, v8, v9, v10
-
-    vp8_mbfilter
-
-    transpose8x16_inv
-
-    add r3, r3, r4
-    neg r4, r4
-
-    WLVmb v17, r9
-    WLVmb v16, r9
-    WLVmb v15, r9
-    WLVmb v14, r9
-    WLVmb v13, r9
-    WLVmb v12, r9
-    WLVmb v11, r9
-    WLVmb v10, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RL V, R, P
-    lvx     \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro WL V, R, P
-    stvx    \V, 0,  \R
-    add     \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
-                                ;# K = |P0-P1| already
-    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
-    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
-    vcmpgtub v10, v14, v0
-
-    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1]
-
-    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
-    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
-    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)
-
-    vmaxub   v14, v14, v4       ;# M = max interior abs diff
-    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded
-
-    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
-    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
-    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded
-
-    ;# replace P1,Q1 w/signed versions
-    common_adjust \P0, \Q0, \P1, \Q1, 1
-
-    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
-    vsrab   v13, v13, v1
-    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
-    vsubsbs \Q1, \Q1, v13
-    vaddsbs \P1, \P1, v13
-
-    vxor    \P1, \P1, v11       ;# P1
-    vxor    \P0, \P0, v11       ;# P0
-    vxor    \Q0, \Q0, v11       ;# Q0
-    vxor    \Q1, \Q1, v11       ;# Q1
-.endm
-
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-;#  r6 const signed char *limit
-;#  r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    addi    r9, r3, 0
-    RL      v16, r9, r4
-    RL      v17, r9, r4
-    RL      v18, r9, r4
-    RL      v19, r9, r4
-    RL      v20, r9, r4
-    RL      v21, r9, r4
-    RL      v22, r9, r4
-    RL      v23, r9, r4
-    RL      v24, r9, r4
-    RL      v25, r9, r4
-    RL      v26, r9, r4
-    RL      v27, r9, r4
-    RL      v28, r9, r4
-    RL      v29, r9, r4
-    RL      v30, r9, r4
-    lvx     v31, 0, r9
-
-    Transpose16x16
-
-    vspltisb v1, 1
-
-    build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18                            ;# K(v14) = first |P0-P1|
-
-    Fil v16, v17, v18, v19,  v20, v21, v22, v23
-    Fil v20, v21, v22, v23,  v24, v25, v26, v27
-    Fil v24, v25, v26, v27,  v28, v29, v30, v31
-
-    Transpose16x16
-
-    addi    r9, r3, 0
-    WL      v16, r9, r4
-    WL      v17, r9, r4
-    WL      v18, r9, r4
-    WL      v19, r9, r4
-    WL      v20, r9, r4
-    WL      v21, r9, r4
-    WL      v22, r9, r4
-    WL      v23, r9, r4
-    WL      v24, r9, r4
-    WL      v25, r9, r4
-    WL      v26, r9, r4
-    WL      v27, r9, r4
-    WL      v28, r9, r4
-    WL      v29, r9, r4
-    WL      v30, r9, r4
-    stvx    v31, 0, r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
-    andi.   r7, r3, 8       ;# row origin modulo 16
-    add     r7, r7, r7      ;# selects selectors
-    lis     r12, _chromaSelectors@ha
-    la      r0,  _chromaSelectors@l(r12)
-    lwzux   r0, r7, r0      ;# leave selector addr in r7
-
-    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
-    lvx     \U, \Offs, r3
-    lvx     \V, \Offs, r4
-    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
-    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
-    vperm   \V, \New, \V, \Vmask
-    stvx    \U, \Offs, r3           ;# Write to frame buffer
-    stvx    \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
-    neg     r9, r5          ;# r9 = -1 * stride
-    add     r8, r9, r9      ;# r8 = -2 * stride
-    add     r10, r5, r5     ;# r10 = 2 * stride
-
-    active_chroma_sel v12
-
-    ;# P3, Q3 are read-only; need not save addresses or sibling pels
-    add     r6, r8, r8      ;# r6 = -4 * stride
-    hread_uv v0, v14, v15, r6, v12
-    add     r6, r10, r5     ;# r6 =  3 * stride
-    hread_uv v7, v14, v15, r6, v12
-
-    ;# Others are read/write; save addresses and sibling pels
-
-    add     r6, r8, r9      ;# r6 = -3 * stride
-    hread_uv v1, v16, v17, r6,  v12
-    hread_uv v2, v18, v19, r8,  v12
-    hread_uv v3, v20, v21, r9,  v12
-    hread_uv v4, v22, v23, 0,   v12
-    hread_uv v5, v24, v25, r5,  v12
-    hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
-    load_g   \V, 4(r7)
-.endm
-
-.macro vresult_sel V
-    load_g   \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
-    uresult_sel v11
-    vresult_sel v12
-    hwrite_uv v2, v18, v19, r8, v11, v12
-    hwrite_uv v3, v20, v21, r9, v11, v12
-    hwrite_uv v4, v22, v23, 0,  v11, v12
-    hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    vp8_mbfilter
-
-    store_chroma_h
-
-    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
-    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    load_chroma_h
-
-    SBFilter
-
-    store_chroma_h
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro R V, R
-    lwzux   r0, r3, r5
-    stw     r0, 4(\R)
-    lwz     r0,-4(r3)
-    stw     r0, 0(\R)
-    lwzux   r0, r4, r5
-    stw     r0,12(\R)
-    lwz     r0,-4(r4)
-    stw     r0, 8(\R)
-    lvx     \V, 0, \R
-.endm
-
-
-.macro W V, R
-    stvx    \V, 0, \R
-    lwz     r0,12(\R)
-    stwux   r0, r4, r5
-    lwz     r0, 8(\R)
-    stw     r0,-4(r4)
-    lwz     r0, 4(\R)
-    stwux   r0, r3, r5
-    lwz     r0, 0(\R)
-    stw     r0,-4(r3)
-.endm
-
-.macro chroma_vread R
-    sub r3, r3, r5          ;# back up one line for simplicity
-    sub r4, r4, r5
-
-    R v0, \R
-    R v1, \R
-    R v2, \R
-    R v3, \R
-    R v4, \R
-    R v5, \R
-    R v6, \R
-    R v7, \R
-
-    transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
-    transpose8x16_inv
-
-    add     r3, r3, r5
-    add     r4, r4, r5
-    neg     r5, r5          ;# Write rows back in reverse order
-
-    W v17, \R
-    W v16, \R
-    W v15, \R
-    W v14, \R
-    W v13, \R
-    W v12, \R
-    W v11, \R
-    W v10, \R
-.endm
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    vp8_mbfilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;#  r3 unsigned char *u
-;#  r4 unsigned char *v
-;#  r5 int p
-;#  r6 const signed char *flimit
-;#  r7 const signed char *limit
-;#  r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    la      r9, -48(r1)         ;# temporary space for reading in vectors
-
-    chroma_vread r9
-
-    build_constants r6, r7, r8, v8, v9, v10
-
-    SBFilter
-
-    chroma_vwrite r9
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
-    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8    ;# v5 = true if _over_ limit
-
-    ;# preserve unsigned v0 and v3
-    common_adjust v1, v2, v0, v3, 0
-
-    vxor v1, v1, v11
-    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
-    addi    r8,  0, 16
-    addi    r7, r5, 32
-
-    lvx     v0,  0, r5
-    lvx     v1, r8, r5
-    lvx     v2,  0, r7
-    lvx     v3, r8, r7
-
-    lis     r12, _B_hihi@ha
-    la      r0,  _B_hihi@l(r12)
-    lvx     v16, 0, r0
-
-    lis     r12, _B_lolo@ha
-    la      r0,  _B_lolo@l(r12)
-    lvx     v17, 0, r0
-
-    Transpose4times4x4 v16, v17
-    vp8_simple_filter
-
-    vxor v0, v0, v11
-    vxor v3, v3, v11        ;# cvt Q0, P0 back to pels
-
-    Transpose4times4x4 v16, v17
-
-    stvx    v0,  0, r5
-    stvx    v1, r8, r5
-    stvx    v2,  0, r7
-    stvx    v3, r8, r7
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    neg     r5, r4              ;# r5 = -1 * stride
-    add     r6, r5, r5          ;# r6 = -2 * stride
-
-    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
-    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
-    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
-    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge
-
-    vp8_simple_filter
-
-    stvx    v1, r5, r3          ;# store P0
-    stvx    v2,  0, r3          ;# store Q0
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-.macro RLV Offs
-    stw     r0, (\Offs*4)(r5)
-    lwzux   r0, r7, r4
-.endm
-
-.macro WLV Offs
-    lwz     r0, (\Offs*4)(r5)
-    stwux   r0, r7, r4
-.endm
-
-    .align 2
-;#  r3 unsigned char *s
-;#  r4 int p
-;#  r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xc000
-    mtspr   256, r12            ;# set VRSAVE
-
-    ;# build constants
-    lvx     v8, 0, r5           ;# flimit
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
-
-    la r5, -96(r1)              ;# temporary space for reading in vectors
-
-    ;# Store 4 pels at word "Offs" in temp array, then advance r7
-    ;#   to next row and read another 4 pels from the frame buffer.
-
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r7          ;# read first 4 pels
-
-    ;# 16 unaligned word accesses
-    RLV 0
-    RLV 4
-    RLV 8
-    RLV 12
-    RLV 1
-    RLV 5
-    RLV 9
-    RLV 13
-    RLV 2
-    RLV 6
-    RLV 10
-    RLV 14
-    RLV 3
-    RLV 7
-    RLV 11
-
-    stw     r0, (15*4)(r5)      ;# write last 4 pels
-
-    simple_vertical
-
-    ;# Read temp array, write frame buffer.
-    subi    r7, r3,  2          ;# r7 -> 2 pels before start
-    lwzx    r0,  0, r5          ;# read/write first 4 pels
-    stwx    r0,  0, r7
-
-    WLV 4
-    WLV 8
-    WLV 12
-    WLV 1
-    WLV 5
-    WLV 9
-    WLV 13
-    WLV 2
-    WLV 6
-    WLV 10
-    WLV 14
-    WLV 3
-    WLV 7
-    WLV 11
-    WLV 15
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-_chromaSelectors:
-    .long   _B_hihi
-    .long   _B_Ures0
-    .long   _B_Vres0
-    .long   0
-    .long   _B_lolo
-    .long   _B_Ures8
-    .long   _B_Vres8
-    .long   0
-
-    .align 4
-_B_Vres8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15
-
-    .align 4
-_B_Ures8:
-    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7
-
-    .align 4
-_B_lolo:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_Vres0:
-    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-    .align 4
-_B_Ures0:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_hihi:
-    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp9/common/ppc/vp9_platform_altivec.asm
+++ /dev/null
@@ -1,59 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl save_platform_context
-    .globl restore_platform_context
-
-.macro W V P
-    stvx    \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-.macro R V P
-    lvx     \V,  0, \P
-    addi    \P, \P, 16
-.endm
-
-;# r3 context_ptr
-    .align 2
-save_platform_contex:
-    W v20, r3
-    W v21, r3
-    W v22, r3
-    W v23, r3
-    W v24, r3
-    W v25, r3
-    W v26, r3
-    W v27, r3
-    W v28, r3
-    W v29, r3
-    W v30, r3
-    W v31, r3
-
-    blr
-
-;# r3 context_ptr
-    .align 2
-restore_platform_context:
-    R v20, r3
-    R v21, r3
-    R v22, r3
-    R v23, r3
-    R v24, r3
-    R v25, r3
-    R v26, r3
-    R v27, r3
-    R v28, r3
-    R v29, r3
-    R v30, r3
-    R v31, r3
-
-    blr
--- a/vp9/common/ppc/vp9_recon_altivec.asm
+++ /dev/null
@@ -1,175 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl recon4b_ppc
-    .globl recon2b_ppc
-    .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
-    lvx     v1,  0, \Pred           ;# v1 = pred = p0..p15
-    addi    \Pred, \Pred, 16        ;# next pred
-    vmrghb  v2, v0, v1              ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff           ;# v3 = d0..d7
-    vaddshs v2, v2, v3              ;# v2 = r0..r7
-    vmrglb  v1, v0, v1              ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff           ;# v3 = d8..d15
-    addi    \Diff, \Diff, 32        ;# next diff
-    vaddshs v3, v3, v1              ;# v3 = r8..r15
-    vpkshus v2, v2, v3              ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, \Dst            ;# to dst
-    add     \Dst, \Dst, \Stride     ;# next dst
-.endm
-
-    .text
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon4b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-    row_of16 r3, r4, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
-    lvx     v1,  0, \Pred       ;# v1 = pred = p0..p15
-    vmrghb  v2, v0, v1          ;# v2 = 16-bit p0..p7
-    lvx     v3,  0, \Diff       ;# v3 = d0..d7
-    vaddshs v2, v2, v3          ;# v2 = r0..r7
-    vmrglb  v1, v0, v1          ;# v1 = 16-bit p8..p15
-    lvx     v3, r8, \Diff       ;# v2 = d8..d15
-    vaddshs v3, v3, v1          ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v3 = 8-bit r0..r15
-    stvx    v2,  0, r10         ;# 2 rows to dst from buf
-    lwz     r0, 0(r10)
-.if \write_first_four_pels
-    stw     r0, 0(\Dst)
-    .else
-    stwux   r0, \Dst, \Stride
-.endif
-    lwz     r0, 4(r10)
-    stw     r0, 4(\Dst)
-    lwz     r0, 8(r10)
-    stwux   r0, \Dst, \Stride       ;# advance dst to next row
-    lwz     r0, 12(r10)
-    stw     r0, 4(\Dst)
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-
-recon2b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-    li      r8, 16
-
-    la      r10, -48(r1)                ;# buf
-
-    two_rows_of8 r3, r4, r5, r6, 1
-
-    addi    r4, r4, 16;                 ;# next pred
-    addi    r3, r3, 32;                 ;# next diff
-
-    two_rows_of8 r3, r4, r5, r6, 0
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
-
-.macro get_two_diff_rows
-    stw     r0, 0(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 4(r10)
-    lwzu    r0, 32(r3)
-    stw     r0, 8(r10)
-    lwz     r0, 4(r3)
-    stw     r0, 12(r10)
-    lvx     v3, 0, r10
-.endm
-
-    .align 2
-;#  r3 = short *diff_ptr,
-;#  r4 = unsigned char *pred_ptr,
-;#  r5 = unsigned char *dst_ptr,
-;#  r6 = int stride
-recon_b_ppc:
-    mfspr   r0, 256                     ;# get old VRSAVE
-    stw     r0, -8(r1)                  ;# save old VRSAVE to stack
-    oris    r0, r0, 0xf000
-    mtspr   256,r0                      ;# set VRSAVE
-
-    vxor    v0, v0, v0
-
-    la      r10, -48(r1)    ;# buf
-
-    lwz     r0, 0(r4)
-    stw     r0, 0(r10)
-    lwz     r0, 16(r4)
-    stw     r0, 4(r10)
-    lwz     r0, 32(r4)
-    stw     r0, 8(r10)
-    lwz     r0, 48(r4)
-    stw     r0, 12(r10)
-
-    lvx     v1,  0, r10;    ;# v1 = pred = p0..p15
-
-    lwz r0, 0(r3)           ;# v3 = d0..d7
-
-    get_two_diff_rows
-
-    vmrghb  v2, v0, v1;     ;# v2 = 16-bit p0..p7
-    vaddshs v2, v2, v3;     ;# v2 = r0..r7
-
-    lwzu r0, 32(r3)         ;# v3 = d8..d15
-
-    get_two_diff_rows
-
-    vmrglb  v1, v0, v1;     ;# v1 = 16-bit p8..p15
-    vaddshs v3, v3, v1;     ;# v3 = r8..r15
-
-    vpkshus v2, v2, v3;     ;# v2 = 8-bit r0..r15
-    stvx    v2,  0, r10;    ;# 16 pels to dst from buf
-
-    lwz     r0, 0(r10)
-    stw     r0, 0(r5)
-    lwz     r0, 4(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 8(r10)
-    stwux   r0, r5, r6
-    lwz     r0, 12(r10)
-    stwx    r0, r5, r6
-
-    lwz     r12, -8(r1)                 ;# restore old VRSAVE from stack
-    mtspr   256, r12                    ;# reset old VRSAVE
-
-    blr
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ /dev/null
@@ -1,167 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_loopfilter.h"
-#include "recon.h"
-#include "vp9/common/vp9_onyxc_int.h"
-
-void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
-void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-
-extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr,
-                                             unsigned char *dst_ptr,
-                                             int src_pixels_per_line,
-                                             int dst_pixels_per_line,
-                                             int rows, int cols, int flimit);
-
-extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch,
-                                    int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
-                                   int rows, int cols, int flimit);
-extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch,
-                                         int rows, int cols, int flimit);
-extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
-                                        int rows, int cols, int flimit);
-extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line,
-                                            int rows, int cols, int flimit);
-void vp9_plane_add_noise_c(unsigned char *start,
-                           unsigned int width, unsigned int height,
-                           int pitch, int q, int a);
-
-extern copy_mem_block_function *vp9_copy_mem16x16;
-extern copy_mem_block_function *vp9_copy_mem8x8;
-extern copy_mem_block_function *vp9_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr,
-                 unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr,
-                 unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr,
-                 unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp9_sixtap_predict_c;
-extern subpixel_predict_function vp9_sixtap_predict8x4_c;
-extern subpixel_predict_function vp9_sixtap_predict8x8_c;
-extern subpixel_predict_function vp9_sixtap_predict16x16_c;
-extern subpixel_predict_function vp9_bilinear_predict4x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x4_c;
-extern subpixel_predict_function vp9_bilinear_predict8x8_c;
-extern subpixel_predict_function vp9_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp9_copy_mem16x16_c;
-extern copy_mem_block_function vp9_copy_mem8x8_c;
-extern copy_mem_block_function vp9_copy_mem8x4_c;
-
-void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr,
-                   unsigned char *dst_ptr, int stride);
-void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr,
-                   unsigned char *dst_ptr, int stride);
-void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr,
-                   unsigned char *dst_ptr, int stride);
-
-extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);
-extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp9_loop_filter_mbv_c;
-extern loop_filter_block_function vp9_loop_filter_bv_c;
-extern loop_filter_block_function vp9_loop_filter_mbh_c;
-extern loop_filter_block_function vp9_loop_filter_bh_c;
-
-extern loop_filter_block_function vp9_loop_filter_mbvs_c;
-extern loop_filter_block_function vp9_loop_filter_bvs_c;
-extern loop_filter_block_function vp9_loop_filter_mbhs_c;
-extern loop_filter_block_function vp9_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp9_clear_c(void) {
-}
-
-void vp9_machine_specific_config(void) {
-  // Pure C:
-  vp9_clear_system_state                = vp9_clear_c;
-  vp9_recon_b                          = vp9_recon_b_c;
-  vp9_recon4b                         = vp9_recon4b_c;
-  vp9_recon2b                         = vp9_recon2b_c;
-
-  vp9_bilinear_predict16x16            = bilinear_predict16x16_ppc;
-  vp9_bilinear_predict8x8              = bilinear_predict8x8_ppc;
-  vp9_bilinear_predict8x4              = bilinear_predict8x4_ppc;
-  vp8_bilinear_predict                 = bilinear_predict4x4_ppc;
-
-  vp9_sixtap_predict16x16              = sixtap_predict16x16_ppc;
-  vp9_sixtap_predict8x8                = sixtap_predict8x8_ppc;
-  vp9_sixtap_predict8x4                = sixtap_predict8x4_ppc;
-  vp9_sixtap_predict                   = sixtap_predict_ppc;
-
-  vp8_short_idct4x4_1                  = vp9_short_idct4x4_1_c;
-  vp8_short_idct4x4                    = short_idct4x4_ppc;
-  vp8_dc_only_idct                      = vp8_dc_only_idct_c;
-
-  vp8_lf_mbvfull                       = loop_filter_mbv_ppc;
-  vp8_lf_bvfull                        = loop_filter_bv_ppc;
-  vp8_lf_mbhfull                       = loop_filter_mbh_ppc;
-  vp8_lf_bhfull                        = loop_filter_bh_ppc;
-
-  vp8_lf_mbvsimple                     = loop_filter_mbvs_ppc;
-  vp8_lf_bvsimple                      = loop_filter_bvs_ppc;
-  vp8_lf_mbhsimple                     = loop_filter_mbhs_ppc;
-  vp8_lf_bhsimple                      = loop_filter_bhs_ppc;
-
-  vp9_post_proc_down_and_across           = vp9_post_proc_down_and_across_c;
-  vp9_mbpost_proc_down                  = vp9_mbpost_proc_down_c;
-  vp9_mbpost_proc_across_ip              = vp9_mbpost_proc_across_ip_c;
-  vp9_plane_add_noise                   = vp9_plane_add_noise_c;
-
-  vp9_copy_mem16x16                    = copy_mem16x16_ppc;
-  vp9_copy_mem8x8                      = vp9_copy_mem8x8_c;
-  vp9_copy_mem8x4                      = vp9_copy_mem8x4_c;
-
-}
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -10,84 +10,109 @@
 
 
 #include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) {
-  int stride = cpi->mode_info_stride;
+void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
+  const int stride = cm->mode_info_stride;
   int i;
 
   // Clear down top border row
-  vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride);
+  vpx_memset(mi, 0, sizeof(MODE_INFO) * stride);
 
   // Clear left border column
-  for (i = 1; i < cpi->mb_rows + 1; i++) {
-    vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO));
-  }
+  for (i = 1; i < cm->mi_rows + 1; i++)
+    vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO));
 }
 
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) {
+void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) {
   int i, j;
 
   // For each in image mode_info element set the in image flag to 1
-  for (i = 0; i < cpi->mb_rows; i++) {
-    for (j = 0; j < cpi->mb_cols; j++) {
-      mi->mbmi.mb_in_image = 1;
-      mi++;   // Next element in the row
+  for (i = 0; i < cm->mi_rows; i++) {
+    MODE_INFO *ptr = mi;
+    for (j = 0; j < cm->mi_cols; j++) {
+      ptr->mbmi.mb_in_image = 1;
+      ptr++;  // Next element in the row
     }
 
-    mi++;       // Step over border element at start of next row
+    // Step over border element at start of next row
+    mi += cm->mode_info_stride;
   }
 }
 
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) {
+void vp9_free_frame_buffers(VP9_COMMON *oci) {
   int i;
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++)
-    vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+    vp9_free_frame_buffer(&oci->yv12_fb[i]);
 
-  vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
-  vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+  vp9_free_frame_buffer(&oci->temp_scale_frame);
+  vp9_free_frame_buffer(&oci->post_proc_buffer);
 
-  vpx_free(oci->above_context);
   vpx_free(oci->mip);
   vpx_free(oci->prev_mip);
+  vpx_free(oci->above_seg_context);
 
-  oci->above_context = 0;
+  vpx_free(oci->above_context[0]);
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    oci->above_context[i] = 0;
   oci->mip = 0;
   oci->prev_mip = 0;
+  oci->above_seg_context = 0;
+}
 
+static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
+  cm->mb_cols = (aligned_width + 8) >> 4;
+  cm->mb_rows = (aligned_height + 8) >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+
+  cm->mi_cols = aligned_width >> LOG2_MI_SIZE;
+  cm->mi_rows = aligned_height >> LOG2_MI_SIZE;
+  cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE;
 }
 
+static void setup_mi(VP9_COMMON *cm) {
+  cm->mi = cm->mip + cm->mode_info_stride + 1;
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
+
+  vpx_memset(cm->mip, 0,
+             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
+
+  vp9_update_mode_info_border(cm, cm->mip);
+  vp9_update_mode_info_in_image(cm, cm->mi);
+
+  vp9_update_mode_info_border(cm, cm->prev_mip);
+  vp9_update_mode_info_in_image(cm, cm->prev_mi);
+}
+
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
-  int i;
-  int aligned_width, aligned_height;
+  int i, mi_cols;
 
-  vp9_de_alloc_frame_buffers(oci);
+  // Our internal buffers are always multiples of 16
+  const int aligned_width = multiple8(width);
+  const int aligned_height = multiple8(height);
+  const int ss_x = oci->subsampling_x;
+  const int ss_y = oci->subsampling_y;
 
-  /* our internal buffers are always multiples of 16 */
-  aligned_width = (width + 15) & ~15;
-  aligned_height = (height + 15) & ~15;
+  vp9_free_frame_buffers(oci);
 
   for (i = 0; i < NUM_YV12_BUFFERS; i++) {
     oci->fb_idx_ref_cnt[i] = 0;
-    if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height,
-                                    VP9BORDERINPIXELS) < 0) {
-      vp9_de_alloc_frame_buffers(oci);
-      return 1;
-    }
+    if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
+                               VP9BORDERINPIXELS) < 0)
+      goto fail;
   }
 
   oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
   oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
 
-  for (i = 0; i < 3; i++)
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
     oci->active_ref_idx[i] = i;
 
   for (i = 0; i < NUM_REF_FRAMES; i++) {
@@ -95,125 +120,86 @@
     oci->fb_idx_ref_cnt[i] = 1;
   }
 
-  if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,
-                                  VP9BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y,
+                             VP9BORDERINPIXELS) < 0)
+    goto fail;
 
-  if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height,
-                                  VP9BORDERINPIXELS) < 0) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
+                             VP9BORDERINPIXELS) < 0)
+    goto fail;
 
-  oci->mb_rows = aligned_height >> 4;
-  oci->mb_cols = aligned_width >> 4;
-  oci->MBs = oci->mb_rows * oci->mb_cols;
-  oci->mode_info_stride = oci->mb_cols + 1;
-  oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+  set_mb_mi(oci, aligned_width, aligned_height);
 
-  if (!oci->mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  // Allocation
+  oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE),
+                        sizeof(MODE_INFO));
+  if (!oci->mip)
+    goto fail;
 
-  oci->mi = oci->mip + oci->mode_info_stride + 1;
+  oci->prev_mip = vpx_calloc(oci->mode_info_stride *
+                             (oci->mi_rows + 64 / MI_SIZE),
+                             sizeof(MODE_INFO));
+  if (!oci->prev_mip)
+    goto fail;
 
-  /* allocate memory for last frame MODE_INFO array */
+  setup_mi(oci);
 
-  oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+  // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
+  // information is exposed at this level
+  mi_cols = mi_cols_aligned_to_sb(oci);
 
-  if (!oci->prev_mip) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
+  // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+  // block where mi unit size is 8x8.
+# if CONFIG_ALPHA
+  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1);
+#else
+  oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1);
+#endif
+  if (!oci->above_context[0])
+    goto fail;
 
-  oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    oci->above_context[i] =
+        oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
 
-  oci->above_context =
-    vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * (3 + oci->mb_cols), 1);
+  oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
+  if (!oci->above_seg_context)
+    goto fail;
 
-  if (!oci->above_context) {
-    vp9_de_alloc_frame_buffers(oci);
-    return 1;
-  }
-
-  vp9_update_mode_info_border(oci, oci->mip);
-  vp9_update_mode_info_in_image(oci, oci->mi);
-
   return 0;
-}
 
-void vp9_setup_version(VP9_COMMON *cm) {
-  if (cm->version & 0x4) {
-    if (!CONFIG_EXPERIMENTAL)
-      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
-                         "Bitstream was created by an experimental "
-                         "encoder");
-    cm->experimental = 1;
-  }
-
-  switch (cm->version & 0x3) {
-    case 0:
-      cm->no_lpf = 0;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 0;
-      cm->full_pixel = 0;
-      break;
-    case 1:
-      cm->no_lpf = 0;
-      cm->filter_type = SIMPLE_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-    case 2:
-    case 3:
-      cm->no_lpf = 1;
-      cm->filter_type = NORMAL_LOOPFILTER;
-      cm->use_bilinear_mc_filter = 1;
-      cm->full_pixel = 0;
-      break;
-      // Full pel only code deprecated in experimental code base
-      // case 3:
-      //    cm->no_lpf = 1;
-      //    cm->filter_type = SIMPLE_LOOPFILTER;
-      //    cm->use_bilinear_mc_filter = 1;
-      //    cm->full_pixel = 1;
-      //    break;
-  }
+ fail:
+  vp9_free_frame_buffers(oci);
+  return 1;
 }
+
 void vp9_create_common(VP9_COMMON *oci) {
   vp9_machine_specific_config(oci);
 
   vp9_init_mbmode_probs(oci);
 
-  vp9_default_bmode_probs(oci->fc.bmode_prob);
-
   oci->txfm_mode = ONLY_4X4;
-  oci->mb_no_coeff_skip = 1;
   oci->comp_pred_mode = HYBRID_PREDICTION;
-  oci->no_lpf = 0;
-  oci->filter_type = NORMAL_LOOPFILTER;
-  oci->use_bilinear_mc_filter = 0;
-  oci->full_pixel = 0;
   oci->clr_type = REG_YUV;
-  oci->clamp_type = RECON_CLAMP_REQUIRED;
 
-  /* Initialise reference frame sign bias structure to defaults */
+  // Initialize reference frame sign bias structure to defaults
   vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
-
-  oci->kf_ymode_probs_update = 0;
 }
 
 void vp9_remove_common(VP9_COMMON *oci) {
-  vp9_de_alloc_frame_buffers(oci);
+  vp9_free_frame_buffers(oci);
 }
 
 void vp9_initialize_common() {
   vp9_coef_tree_initialize();
-
   vp9_entropy_mode_init();
-
   vp9_entropy_mv_init();
+}
+
+void vp9_update_frame_size(VP9_COMMON *cm) {
+  const int aligned_width = multiple8(cm->width);
+  const int aligned_height = multiple8(cm->height);
+
+  set_mb_mi(cm, aligned_width, aligned_height);
+  setup_mi(cm);
 }
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -14,13 +14,18 @@
 
 #include "vp9/common/vp9_onyxc_int.h"
 
+void vp9_initialize_common();
+
+void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
+void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
 void vp9_create_common(VP9_COMMON *oci);
 void vp9_remove_common(VP9_COMMON *oci);
-void vp9_de_alloc_frame_buffers(VP9_COMMON *oci);
+
 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_setup_version(VP9_COMMON *oci);
+void vp9_free_frame_buffers(VP9_COMMON *oci);
 
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+
+void vp9_update_frame_size(VP9_COMMON *cm);
 
 #endif  // VP9_COMMON_VP9_ALLOCCOMMON_H_
--- a/vp9/common/vp9_blockd.c
+++ /dev/null
@@ -1,442 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24] = {
-  { 0, 0, 0, 0,
-    1, 1, 1, 1,
-    2, 2, 2, 2,
-    3, 3, 3, 3,
-    4, 4,
-    5, 5,
-    6, 6,
-    7, 7 },
-  { 0, 0, 0, 0,
-    0, 0, 0, 0,
-    2, 2, 2, 2,
-    2, 2, 2, 2,
-    4, 4,
-    4, 4,
-    6, 6,
-    6, 6 },
-  { 0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0 },
-};
-const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24] = {
-  { 0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    0, 1, 2, 3,
-    4, 5,
-    4, 5,
-    6, 7,
-    6, 7 },
-  { 0, 0, 0, 0,
-    2, 2, 2, 2,
-    0, 0, 0, 0,
-    2, 2, 2, 2,
-    4, 4,
-    4, 4,
-    6, 6,
-    6, 6 },
-  { 0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0,
-    0, 0, 0, 0 },
-};
-
-#define S(x) x + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)
-const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96] = {
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),
-    4, 4, 4, 4,
-    5, 5, 5, 5,
-    S(4), S(4), S(4), S(4),
-    S(5), S(5), S(5), S(5),
-    6, 6, 6, 6,
-    7, 7, 7, 7,
-    S(6), S(6), S(6), S(6),
-    S(7), S(7), S(7), S(7) },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    S(4), S(4), S(4), S(4),
-    S(4), S(4), S(4), S(4),
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    S(6), S(6), S(6), S(6),
-    S(6), S(6), S(6), S(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6 },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0 },
-};
-const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96] = {
-  { 0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3),
-    4, 5, S(4), S(5),
-    4, 5, S(4), S(5),
-    4, 5, S(4), S(5),
-    4, 5, S(4), S(5),
-    6, 7, S(6), S(7),
-    6, 7, S(6), S(7),
-    6, 7, S(6), S(7),
-    6, 7, S(6), S(7) },
-  { 0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    0, 0, 0, 0, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    4, 4, 4, 4,
-    S(4), S(4), S(4), S(4),
-    4, 4, 4, 4,
-    S(4), S(4), S(4), S(4),
-    6, 6, 6, 6,
-    S(6), S(6), S(6), S(6),
-    6, 6, 6, 6,
-    S(6), S(6), S(6), S(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    4, 4, 4, 4,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6,
-    6, 6, 6, 6 },
-  { 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0 },
-};
-
-#define T(x) x + 2 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))
-#define U(x) x + 3 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))
-const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384] = {
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1),
-    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
-    T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1),
-    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
-    U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5, 5, 5, 5, 5,
-    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
-    S(5), S(5), S(5), S(5), S(5), S(5), S(5), S(5),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(5), T(5), T(5), T(5), T(5), T(5), T(5), T(5),
-    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
-    U(5), U(5), U(5), U(5), U(5), U(5), U(5), U(5),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    7, 7, 7, 7, 7, 7, 7, 7,
-    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
-    S(7), S(7), S(7), S(7), S(7), S(7), S(7), S(7),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(7), T(7), T(7), T(7), T(7), T(7), T(7), T(7),
-    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),
-    U(7), U(7), U(7), U(7), U(7), U(7), U(7), U(7) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
-    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
-    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
-    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
-    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
-    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),
-    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6 },
-};
-const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384] = {
-  { 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),
-    6, 7, S(6), S(7), T(6), T(7), U(6), U(7) },
-  { 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),
-    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    4, 4, 4, 4, S(4), S(4), S(4), S(4),
-    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),
-    6, 6, 6, 6, S(6), S(6), S(6), S(6),
-    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),
-    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),
-    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),
-    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },
-  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 6, 6, 6, 6 },
-};
-#undef U
-#undef T
-#undef S
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -12,8 +12,6 @@
 #ifndef VP9_COMMON_VP9_BLOCKD_H_
 #define VP9_COMMON_VP9_BLOCKD_H_
 
-void vpx_log(const char *format, ...);
-
 #include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_convolve.h"
@@ -21,35 +19,27 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
 
-#define TRUE    1
-#define FALSE   0
+#define BLOCK_SIZE_GROUPS   4
+#define MAX_MB_SEGMENTS     8
+#define MB_SEG_TREE_PROBS   (MAX_MB_SEGMENTS-1)
 
-// #define MODE_STATS
-
-/*#define DCPRED 1*/
-#define DCPREDSIMTHRESH 0
-#define DCPREDCNTTHRESH 3
-
-#define MB_FEATURE_TREE_PROBS   3
 #define PREDICTION_PROBS 3
 
 #define MBSKIP_CONTEXTS 3
 
-#define MAX_MB_SEGMENTS         4
-
 #define MAX_REF_LF_DELTAS       4
-#define MAX_MODE_LF_DELTAS      4
+#define MAX_MODE_LF_DELTAS      2
 
 /* Segment Feature Masks */
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
-#define MAX_MV_REFS 9
-#define MAX_MV_REF_CANDIDATES 4
+#define MAX_MV_REF_CANDIDATES 2
 
-typedef struct {
-  int r, c;
-} POS;
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
 
 typedef enum {
   PLANE_TYPE_Y_WITH_DC,
@@ -57,24 +47,21 @@
 } PLANE_TYPE;
 
 typedef char ENTROPY_CONTEXT;
-typedef struct {
-  ENTROPY_CONTEXT y1[4];
-  ENTROPY_CONTEXT u[2];
-  ENTROPY_CONTEXT v[2];
-} ENTROPY_CONTEXT_PLANES;
 
-#define VP9_COMBINEENTROPYCONTEXTS(Dest, A, B) \
-  Dest = ((A)!=0) + ((B)!=0);
+typedef char PARTITION_CONTEXT;
 
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  return (a != 0) + (b != 0);
+}
+
 typedef enum {
   KEY_FRAME = 0,
-  INTER_FRAME = 1
+  INTER_FRAME = 1,
+  NUM_FRAME_TYPES,
 } FRAME_TYPE;
 
 typedef enum {
-#if CONFIG_ENABLE_6TAP
-  SIXTAP,
-#endif
   EIGHTTAP_SMOOTH,
   EIGHTTAP,
   EIGHTTAP_SHARP,
@@ -83,26 +70,27 @@
 } INTERPOLATIONFILTERTYPE;
 
 typedef enum {
-  DC_PRED,            /* average of above and left pixels */
-  V_PRED,             /* vertical prediction */
-  H_PRED,             /* horizontal prediction */
-  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
-  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
-  D117_PRED,          /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
-  D153_PRED,          /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
-  D27_PRED,           /* Directional 22 deg prediction  [anti-clockwise from 0 deg hor] */
-  D63_PRED,           /* Directional 67 deg prediction  [anti-clockwise from 0 deg hor] */
-  TM_PRED,            /* Truemotion prediction */
-  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
-  B_PRED,             /* block based prediction, each block has its own prediction mode */
+  DC_PRED,         // Average of above and left pixels
+  V_PRED,          // Vertical
+  H_PRED,          // Horizontal
+  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
+  D135_PRED,       // Directional 135 deg = 180 - 45
+  D117_PRED,       // Directional 117 deg = 180 - 63
+  D153_PRED,       // Directional 153 deg = 180 - 27
+  D27_PRED,        // Directional 27  deg = round(arctan(1/2) * 180/pi)
+  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
+  TM_PRED,         // True-motion
   NEARESTMV,
   NEARMV,
   ZEROMV,
   NEWMV,
-  SPLITMV,
   MB_MODE_COUNT
 } MB_PREDICTION_MODE;
 
+static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
+  return mode >= NEARESTMV && mode <= NEWMV;
+}
+
 // Segment level features.
 typedef enum {
   SEG_LVL_ALT_Q = 0,               // Use alternate Quantizer ....
@@ -117,8 +105,7 @@
   TX_4X4 = 0,                      // 4x4 dct transform
   TX_8X8 = 1,                      // 8x8 dct transform
   TX_16X16 = 2,                    // 16x16 dct transform
-  TX_SIZE_MAX_MB = 3,              // Number of different transforms available
-  TX_32X32 = TX_SIZE_MAX_MB,       // 32x32 dct transform
+  TX_32X32 = 3,                    // 32x32 dct transform
   TX_SIZE_MAX_SB,                  // Number of transforms available to SBs
 } TX_SIZE;
 
@@ -129,62 +116,19 @@
   ADST_ADST = 3                       // ADST in both directions
 } TX_TYPE;
 
-#define VP9_YMODES  (B_PRED + 1)
-#define VP9_UV_MODES (TM_PRED + 1)
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#define VP9_I32X32_MODES (TM_PRED + 1)
+#define VP9_INTRA_MODES (TM_PRED + 1)
 
-#define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
+#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV)
 
 #define WHT_UPSCALE_FACTOR 2
 
-typedef enum {
-  B_DC_PRED,          /* average of above and left pixels */
-  B_TM_PRED,
+#define TX_SIZE_PROBS  6  // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2)
 
-  B_VE_PRED,          /* vertical prediction */
-  B_HE_PRED,          /* horizontal prediction */
+#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \
+                            (c)->fc.tx_probs_8x8p :    \
+                            (b) < BLOCK_SIZE_SB32X32 ? \
+                            (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p)
 
-  B_LD_PRED,
-  B_RD_PRED,
-
-  B_VR_PRED,
-  B_VL_PRED,
-  B_HD_PRED,
-  B_HU_PRED,
-#if CONFIG_NEWBINTRAMODES
-  B_CONTEXT_PRED,
-#endif
-
-  LEFT4X4,
-  ABOVE4X4,
-  ZERO4X4,
-  NEW4X4,
-
-  B_MODE_COUNT
-} B_PREDICTION_MODE;
-
-#define VP9_BINTRAMODES (LEFT4X4)
-#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
-
-#if CONFIG_NEWBINTRAMODES
-/* The number of B_PRED intra modes that are replaced by B_CONTEXT_PRED */
-#define CONTEXT_PRED_REPLACEMENTS  0
-#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES - 1)
-#define VP9_NKF_BINTRAMODES  (VP9_BINTRAMODES - CONTEXT_PRED_REPLACEMENTS)
-#else
-#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES)   /* 10 */
-#define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES)  /* 10 */
-#endif
-
-typedef enum {
-  PARTITIONING_16X8 = 0,
-  PARTITIONING_8X16,
-  PARTITIONING_8X8,
-  PARTITIONING_4X4,
-  NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -191,11 +135,7 @@
 
 union b_mode_info {
   struct {
-    B_PREDICTION_MODE first;
-    TX_TYPE           tx_type;
-#if CONFIG_NEWBINTRAMODES
-    B_PREDICTION_MODE context;
-#endif
+    MB_PREDICTION_MODE first;
   } as_mode;
   int_mv as_mv[2];  // first, second inter predictor motion vectors
 };
@@ -209,37 +149,80 @@
   MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
-typedef enum {
-  BLOCK_SIZE_MB16X16 = 0,
-  BLOCK_SIZE_SB32X32 = 1,
-  BLOCK_SIZE_SB64X64 = 2,
-} BLOCK_SIZE_TYPE;
+static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
+  switch (sb_type) {
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_AB4X4: return 0;
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_SB8X8:
+    case BLOCK_SIZE_SB8X16: return 1;
+    case BLOCK_SIZE_SB16X8:
+    case BLOCK_SIZE_MB16X16:
+    case BLOCK_SIZE_SB16X32: return 2;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB32X32:
+    case BLOCK_SIZE_SB32X64: return 3;
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB64X64: return 4;
+    default: assert(0);
+      return -1;
+  }
+}
 
+static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
+  switch (sb_type) {
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_AB4X4: return 0;
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_SB8X8:
+    case BLOCK_SIZE_SB16X8: return 1;
+    case BLOCK_SIZE_SB8X16:
+    case BLOCK_SIZE_MB16X16:
+    case BLOCK_SIZE_SB32X16: return 2;
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_SB32X32:
+    case BLOCK_SIZE_SB64X32: return 3;
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB64X64: return 4;
+    default: assert(0);
+      return -1;
+  }
+}
+
+static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
+  int a = b_width_log2(sb_type) - 1;
+  // align 4x4 block to mode_info
+  if (a < 0)
+    a = 0;
+  assert(a >= 0);
+  return a;
+}
+
+static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
+  int a = b_height_log2(sb_type) - 1;
+  if (a < 0)
+    a = 0;
+  assert(a >= 0);
+  return a;
+}
+
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-  MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
-#endif
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  MV_REFERENCE_FRAME ref_frame[2];
   TX_SIZE txfm_size;
   int_mv mv[2]; // for each reference frame used
   int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int_mv best_mv, best_second_mv;
-#if CONFIG_NEW_MVREF
-  int best_index, best_second_index;
-#endif
 
   int mb_mode_context[MAX_REF_FRAMES];
 
-  SPLITMV_PARTITIONING_TYPE partitioning;
   unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
-  unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
+  unsigned char segment_id;           // Segment id for current frame
 
   // Flags used for prediction status of various bistream signals
   unsigned char seg_id_predicted;
-  unsigned char ref_predicted;
 
   // Indicates if the mb is part of the image (1) vs border (0)
   // This can be useful in determining whether the MB provides
@@ -249,69 +232,62 @@
   INTERPOLATIONFILTERTYPE interp_filter;
 
   BLOCK_SIZE_TYPE sb_type;
-#if CONFIG_CODE_NONZEROCOUNT
-  uint16_t nzcs[256+64*2];
-#endif
 } MB_MODE_INFO;
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16];
+  union b_mode_info bmi[4];
 } MODE_INFO;
 
-typedef struct blockd {
-  int16_t *qcoeff;
-  int16_t *dqcoeff;
-  uint8_t *predictor;
-  int16_t *diff;
-  int16_t *dequant;
-
-  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-  uint8_t **base_pre;
-  uint8_t **base_second_pre;
-  int pre;
-  int pre_stride;
-
-  uint8_t **base_dst;
-  int dst;
-  int dst_stride;
-
-  union b_mode_info bmi;
-} BLOCKD;
-
+#define VP9_REF_SCALE_SHIFT 14
 struct scale_factors {
-  int x_num;
-  int x_den;
+  int x_scale_fp;   // horizontal fixed point scale factor
+  int y_scale_fp;   // vertical fixed point scale factor
   int x_offset_q4;
   int x_step_q4;
-  int y_num;
-  int y_den;
   int y_offset_q4;
   int y_step_q4;
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-  convolve_fn_t predict[2][2][8];  // horiz, vert, weight (0 - 7)
-#else
+
+  int (*scale_value_x)(int val, const struct scale_factors *scale);
+  int (*scale_value_y)(int val, const struct scale_factors *scale);
+  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
+  int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv,
+                                const struct scale_factors *scale);
+  int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4);
+
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
-#endif
 };
 
-typedef struct macroblockd {
-  DECLARE_ALIGNED(16, int16_t,  diff[64*64+32*32*2]);      /* from idct diff */
-  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);  // unused for superblocks
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]);
-#if CONFIG_CODE_NONZEROCOUNT
-  DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);
+#if CONFIG_ALPHA
+enum { MAX_MB_PLANE = 4 };
+#else
+enum { MAX_MB_PLANE = 3 };
 #endif
 
-  /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */
-  BLOCKD block[24];
-  int fullpixel_mask;
+struct buf_2d {
+  uint8_t *buf;
+  int stride;
+};
 
-  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  YV12_BUFFER_CONFIG second_pre;
-  YV12_BUFFER_CONFIG dst;
+struct macroblockd_plane {
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[256]);
+  PLANE_TYPE plane_type;
+  int subsampling_x;
+  int subsampling_y;
+  struct buf_2d dst;
+  struct buf_2d pre[2];
+  int16_t *dequant;
+  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *left_context;
+};
+
+#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n))
+
+typedef struct macroblockd {
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+
   struct scale_factors scale_factor[2];
   struct scale_factors scale_factor_uv[2];
 
@@ -325,11 +301,11 @@
   int left_available;
   int right_available;
 
-  /* Y,U,V */
-  ENTROPY_CONTEXT_PLANES *above_context;
-  ENTROPY_CONTEXT_PLANES *left_context;
+  // partition contexts
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT *left_seg_context;
 
-  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+  /* 0 (disable) 1 (enable) segmentation */
   unsigned char segmentation_enabled;
 
   /* 0 (do not update) 1 (update) the macroblock segmentation map. */
@@ -345,15 +321,10 @@
   /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */
 
   // Probability Tree used to code Segment number
-  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-  vp9_prob mb_segment_mispred_tree_probs[MAX_MB_SEGMENTS];
+  vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS];
 
-#if CONFIG_NEW_MVREF
-  vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
-#endif
-
   // Segment features
-  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
+  int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
   unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
 
   /* mode_based Loop filter adjustment */
@@ -361,10 +332,14 @@
   unsigned char mode_ref_lf_delta_update;
 
   /* Delta values have the range +/- MAX_LOOP_FILTER */
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];                /* 0 = Intra, Last, GF, ARF */
-  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];                     /* 0 = Intra, Last, GF, ARF */
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];              /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                   /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+  /* 0 = Intra, Last, GF, ARF */
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+  /* 0 = Intra, Last, GF, ARF */
+  signed char ref_lf_deltas[MAX_REF_LF_DELTAS];
+  /* 0 = ZERO_MV, MV */
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+  /* 0 = ZERO_MV, MV */
+  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
@@ -377,15 +352,13 @@
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*itxm_add)(int16_t *input, const int16_t *dq,
-    uint8_t *pred, uint8_t *output, int pitch, int stride, int eob);
-  void (*itxm_add_y_block)(int16_t *q, const int16_t *dq,
-    uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd);
-  void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq,
-    uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride,
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+  void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
     struct macroblockd *xd);
+  void (*itxm_add_uv_block)(int16_t *q, uint8_t *dst, int stride,
+    uint16_t *eobs);
 
   struct subpix_fn_table  subpix;
 
@@ -393,212 +366,187 @@
 
   int corrupted;
 
-  int sb_index;
-  int mb_index;   // Index of the MB in the SB (0..3)
+  int sb_index;   // index of 32x32 block inside the 64x64 block
+  int mb_index;   // index of 16x16 block inside the 32x32 block
+  int b_index;    // index of 8x8 block inside the 16x16 block
+  int ab_index;   // index of 4x4 block inside the 8x8 block
   int q_index;
 
 } MACROBLOCKD;
 
-#define ACTIVE_HT   110                // quantization stepsize threshold
+static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+  switch (subsize) {
+    case BLOCK_SIZE_SB64X64:
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB32X32:
+      return &xd->sb_index;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      return &xd->mb_index;
+    case BLOCK_SIZE_SB16X8:
+    case BLOCK_SIZE_SB8X16:
+    case BLOCK_SIZE_SB8X8:
+      return &xd->b_index;
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_AB4X4:
+      return &xd->ab_index;
+    default:
+      assert(0);
+      return NULL;
+  }
+}
 
-#define ACTIVE_HT8  300
+static INLINE void update_partition_context(MACROBLOCKD *xd,
+                                            BLOCK_SIZE_TYPE sb_type,
+                                            BLOCK_SIZE_TYPE sb_size) {
+  int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
+  int bwl = b_width_log2(sb_type);
+  int bhl = b_height_log2(sb_type);
+  int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+  int i;
 
-#define ACTIVE_HT16 300
+  // update the partition context at the end nodes. Set partition bits
+  // of block sizes larger than the current one to be one, and partition
+  // bits of smaller block sizes to be zero.
+  if ((bwl == bsl) && (bhl == bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xf << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xf << boffset);
+  } else if ((bwl == bsl) && (bhl < bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xe << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xf << boffset);
+  }  else if ((bwl < bsl) && (bhl == bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xf << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xe << boffset);
+  } else if ((bwl < bsl) && (bhl < bsl)) {
+    for (i = 0; i < bs; i++)
+      xd->left_seg_context[i] = ~(0xe << boffset);
+    for (i = 0; i < bs; i++)
+      xd->above_seg_context[i] = ~(0xe << boffset);
+  } else {
+    assert(0);
+  }
+}
 
-// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
-static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
-  switch (mode) {
-    case DC_PRED: return B_DC_PRED;
-    case V_PRED: return B_VE_PRED;
-    case H_PRED: return B_HE_PRED;
-    case TM_PRED: return B_TM_PRED;
-    case D45_PRED: return B_LD_PRED;
-    case D135_PRED: return B_RD_PRED;
-    case D117_PRED: return B_VR_PRED;
-    case D153_PRED: return B_HD_PRED;
-    case D27_PRED: return B_HU_PRED;
-    case D63_PRED: return B_VL_PRED;
+static INLINE int partition_plane_context(MACROBLOCKD *xd,
+                                          BLOCK_SIZE_TYPE sb_type) {
+  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+  int above = 0, left = 0, i;
+  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+
+  assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
+  assert(bsl >= 0);
+  assert(boffset >= 0);
+
+  for (i = 0; i < bs; i++)
+    above |= (xd->above_seg_context[i] & (1 << boffset));
+  for (i = 0; i < bs; i++)
+    left |= (xd->left_seg_context[i] & (1 << boffset));
+
+  above = (above > 0);
+  left  = (left > 0);
+
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
+                                   PARTITION_TYPE partition) {
+  BLOCK_SIZE_TYPE subsize;
+  switch (partition) {
+    case PARTITION_NONE:
+      subsize = bsize;
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB64X32;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_SB32X16;
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB16X8;
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_SB8X4;
+      else
+        assert(0);
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB32X64;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_SB16X32;
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB8X16;
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_SB4X8;
+      else
+        assert(0);
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_SIZE_SB64X64)
+        subsize = BLOCK_SIZE_SB32X32;
+      else if (bsize == BLOCK_SIZE_SB32X32)
+        subsize = BLOCK_SIZE_MB16X16;
+      else if (bsize == BLOCK_SIZE_MB16X16)
+        subsize = BLOCK_SIZE_SB8X8;
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_AB4X4;
+      else
+        assert(0);
+      break;
     default:
-       assert(0);
-       return B_MODE_COUNT;  // Dummy value
+      assert(0);
   }
+  return subsize;
 }
 
 // transform mapping
-static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {
+static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) {
   switch (bmode) {
-    case B_TM_PRED :
-    case B_RD_PRED :
+    case TM_PRED :
+    case D135_PRED :
       return ADST_ADST;
 
-    case B_VE_PRED :
-    case B_VR_PRED :
+    case V_PRED :
+    case D117_PRED :
+    case D63_PRED:
       return ADST_DCT;
 
-    case B_HE_PRED :
-    case B_HD_PRED :
-    case B_HU_PRED :
+    case H_PRED :
+    case D153_PRED :
+    case D27_PRED :
       return DCT_ADST;
 
-#if CONFIG_NEWBINTRAMODES
-    case B_CONTEXT_PRED:
-      assert(0);
-      break;
-#endif
-
     default:
       return DCT_DCT;
   }
 }
 
-extern const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24];
-extern const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24];
-extern const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96];
-extern const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96];
-extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384];
-extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384];
-
-#define USE_ADST_FOR_I16X16_8X8   1
-#define USE_ADST_FOR_I16X16_4X4   1
-#define USE_ADST_FOR_I8X8_4X4     1
-#define USE_ADST_PERIPHERY_ONLY   1
-#define USE_ADST_FOR_SB           1
-#define USE_ADST_FOR_REMOTE_EDGE  0
-
 static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {
-  // TODO(debargha): explore different patterns for ADST usage when blocksize
-  // is smaller than the prediction size
-  TX_TYPE tx_type = DCT_DCT;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-#if !USE_ADST_FOR_SB
-  if (sb_type)
-    return tx_type;
-#endif
-  if (ib >= (16 << (2 * sb_type)))  // no chroma adst
-    return tx_type;
-  if (xd->lossless)
+  TX_TYPE tx_type;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME)
     return DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode == B_PRED &&
-      xd->q_index < ACTIVE_HT) {
-    const BLOCKD *b = &xd->block[ib];
-    tx_type = txfm_map(
-#if CONFIG_NEWBINTRAMODES
-        b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context :
-#endif
-        b->bmi.as_mode.first);
-  } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-             xd->q_index < ACTIVE_HT) {
-    const BLOCKD *b = &xd->block[ib];
-    const int ic = (ib & 10);
-#if USE_ADST_FOR_I8X8_4X4
-#if USE_ADST_PERIPHERY_ONLY
-    // Use ADST for periphery blocks only
-    const int inner = ib & 5;
-    b += ic - ib;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-#if USE_ADST_FOR_REMOTE_EDGE
-    if (inner == 5)
-      tx_type = DCT_DCT;
-#else
-    if (inner == 1) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (inner == 4) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (inner == 5) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    b += ic - ib;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
-  } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-             xd->q_index < ACTIVE_HT) {
-#if USE_ADST_FOR_I16X16_4X4
-#if USE_ADST_PERIPHERY_ONLY
-    const int hmax = 4 << sb_type;
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#if USE_ADST_FOR_REMOTE_EDGE
-    if ((ib & (hmax - 1)) != 0 && ib >= hmax)
-      tx_type = DCT_DCT;
-#else
-    if (ib >= 1 && ib < hmax) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (ib != 0) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    tx_type = txfm_map(mi->bmi[ib].as_mode.first);
+  } else {
+    assert(mbmi->mode <= TM_PRED);
+    tx_type = txfm_map(mbmi->mode);
   }
   return tx_type;
 }
 
 static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {
-  // TODO(debargha): explore different patterns for ADST usage when blocksize
-  // is smaller than the prediction size
   TX_TYPE tx_type = DCT_DCT;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-#if !USE_ADST_FOR_SB
-  if (sb_type)
-    return tx_type;
-#endif
-  if (ib >= (16 << (2 * sb_type)))  // no chroma adst
-    return tx_type;
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-      xd->q_index < ACTIVE_HT8) {
-    const BLOCKD *b = &xd->block[ib];
-    // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
-    // or the relationship otherwise modified to address this type conversion.
-    tx_type = txfm_map(pred_mode_conv(
-           (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-  } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-             xd->q_index < ACTIVE_HT8) {
-#if USE_ADST_FOR_I16X16_8X8
-#if USE_ADST_PERIPHERY_ONLY
-    const int hmax = 4 << sb_type;
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#if USE_ADST_FOR_REMOTE_EDGE
-    if ((ib & (hmax - 1)) != 0 && ib >= hmax)
-      tx_type = DCT_DCT;
-#else
-    if (ib >= 1 && ib < hmax) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (ib != 0) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
+  if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
+    tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
   }
   return tx_type;
 }
@@ -605,71 +553,358 @@
 
 static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {
   TX_TYPE tx_type = DCT_DCT;
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-#if !USE_ADST_FOR_SB
-  if (sb_type)
-    return tx_type;
-#endif
-  if (ib >= (16 << (2 * sb_type)))
-    return tx_type;
-  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-      xd->q_index < ACTIVE_HT16) {
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-#if USE_ADST_PERIPHERY_ONLY
-    if (sb_type) {
-      const int hmax = 4 << sb_type;
-#if USE_ADST_FOR_REMOTE_EDGE
-      if ((ib & (hmax - 1)) != 0 && ib >= hmax)
-        tx_type = DCT_DCT;
-#else
-      if (ib >= 1 && ib < hmax) {
-        if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-        else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-      } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {
-        if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-        else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-      } else if (ib != 0) {
-        tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode <= TM_PRED) {
+    tx_type = txfm_map(xd->mode_info_context->mbmi.mode);
+  }
+  return tx_type;
+}
+
+void vp9_setup_block_dptrs(MACROBLOCKD *xd,
+                           int subsampling_x, int subsampling_y);
+
+static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
+  const TX_SIZE size = mbmi->txfm_size;
+
+  switch (mbmi->sb_type) {
+    case BLOCK_SIZE_SB64X64:
+      return size;
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB32X32:
+      if (size == TX_32X32)
+        return TX_16X16;
+      else
+        return size;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      if (size == TX_16X16)
+        return TX_8X8;
+      else
+        return size;
+    default:
+      return TX_4X4;
+  }
+
+  return size;
+}
+
+struct plane_block_idx {
+  int plane;
+  int block;
+};
+
+// TODO(jkoleszar): returning a struct so it can be used in a const context,
+// expect to refactor this further later.
+static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
+                                                     int b_idx) {
+  const int v_offset = y_blocks * 5 / 4;
+  struct plane_block_idx res;
+
+  if (b_idx < y_blocks) {
+    res.plane = 0;
+    res.block = b_idx;
+  } else if (b_idx < v_offset) {
+    res.plane = 1;
+    res.block = b_idx - y_blocks;
+  } else {
+    assert(b_idx < y_blocks * 3 / 2);
+    res.plane = 2;
+    res.block = b_idx - v_offset;
+  }
+  return res;
+}
+
+static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
+                                    const struct macroblockd_plane* plane) {
+  return 4 << (b_width_log2(bsize) - plane->subsampling_x);
+}
+
+static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
+                                     const struct macroblockd_plane* plane) {
+  return 4 << (b_height_log2(bsize) - plane->subsampling_y);
+}
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  BLOCK_SIZE_TYPE bsize,
+                                                  int ss_txfrm_size,
+                                                  void *arg);
+
+static INLINE void foreach_transformed_block_in_plane(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+
+  // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // transform size varies per plane, look it up in a common way.
+  const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
+  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
+                                : mbmi->txfm_size;
+  const int block_size_b = bw + bh;
+  const int txfrm_size_b = tx_size * 2;
+
+  // subsampled size of the block
+  const int ss_sum = xd->plane[plane].subsampling_x
+      + xd->plane[plane].subsampling_y;
+  const int ss_block_size = block_size_b - ss_sum;
+
+  const int step = 1 << txfrm_size_b;
+
+  int i;
+
+  assert(txfrm_size_b <= block_size_b);
+  assert(txfrm_size_b <= ss_block_size);
+
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    int r, c;
+    const int sw = bw - xd->plane[plane].subsampling_x;
+    const int sh = bh - xd->plane[plane].subsampling_y;
+    int max_blocks_wide = 1 << sw;
+    int max_blocks_high = 1 << sh;
+
+    // xd->mb_to_right_edge is in units of pixels * 8.  This converts
+    // it to 4x4 block sizes.
+    if (xd->mb_to_right_edge < 0)
+      max_blocks_wide +=
+          + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+
+    if (xd->mb_to_bottom_edge < 0)
+      max_blocks_high +=
+          + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+
+    i = 0;
+    // Unlike the normal case - in here we have to keep track of the
+    // row and column of the blocks we use so that we know if we are in
+    // the unrestricted motion border.
+    for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
+      for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
+        if (r < max_blocks_high && c < max_blocks_wide)
+          visit(plane, i, bsize, txfrm_size_b, arg);
+        i += step;
       }
-#endif
     }
-#endif
+  } else {
+    for (i = 0; i < (1 << ss_block_size); i += step) {
+      visit(plane, i, bsize, txfrm_size_b, arg);
+    }
   }
-  return tx_type;
 }
 
-void vp9_build_block_doffsets(MACROBLOCKD *xd);
-void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+static INLINE void foreach_transformed_block(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_transformed_block_visitor visit, void *arg) {
+  int plane;
 
-static void update_blockd_bmi(MACROBLOCKD *xd) {
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    foreach_transformed_block_in_plane(xd, bsize, plane,
+                                       visit, arg);
+  }
+}
 
-  if (mode == SPLITMV || mode == I8X8_PRED || mode == B_PRED) {
-    int i;
-    for (i = 0; i < 16; i++)
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+static INLINE void foreach_transformed_block_uv(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_transformed_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    foreach_transformed_block_in_plane(xd, bsize, plane,
+                                       visit, arg);
   }
 }
 
-static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
-  TX_SIZE tx_size_uv;
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    if (xd->mode_info_context->mbmi.txfm_size == TX_32X32)
-      tx_size_uv = TX_16X16;
-    else
-      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
+// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
+// sizes smaller than 16x16 yet.
+typedef void (*foreach_predicted_block_visitor)(int plane, int block,
+                                                BLOCK_SIZE_TYPE bsize,
+                                                int pred_w, int pred_h,
+                                                void *arg);
+static INLINE void foreach_predicted_block_in_plane(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+    foreach_predicted_block_visitor visit, void *arg) {
+  int i, x, y;
+
+  // block sizes in number of 4x4 blocks log 2 ("*_b")
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
+  // subsampled size of the block
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+
+  // size of the predictor to use.
+  int pred_w, pred_h;
+
+  if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+    assert(bsize == BLOCK_SIZE_SB8X8);
+    pred_w = 0;
+    pred_h = 0;
   } else {
-    if (xd->mode_info_context->mbmi.txfm_size == TX_16X16)
-      tx_size_uv = TX_8X8;
-    else if (xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-             (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-              xd->mode_info_context->mbmi.mode == SPLITMV))
-      tx_size_uv = TX_4X4;
-    else
-      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;
+    pred_w = bwl;
+    pred_h = bhl;
   }
-  return tx_size_uv;
+  assert(pred_w <= bwl);
+  assert(pred_h <= bhl);
+
+  // visit each subblock in raster order
+  i = 0;
+  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
+    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
+      visit(plane, i, bsize, pred_w, pred_h, arg);
+      i += 1 << pred_w;
+    }
+    i += (1 << (bwl + pred_h)) - (1 << bwl);
+  }
 }
+static INLINE void foreach_predicted_block(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_predicted_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
+  }
+}
+static INLINE void foreach_predicted_block_uv(
+    const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+    foreach_predicted_block_visitor visit, void *arg) {
+  int plane;
+
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);
+  }
+}
+static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                               int plane, int block, int stride) {
+  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1));
+  return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,
+                                         BLOCK_SIZE_TYPE bsize,
+                                         int plane, int block, int16_t *base) {
+  const int stride = plane_block_width(bsize, &xd->plane[plane]);
+  return base + raster_block_offset(xd, bsize, plane, block, stride);
+}
+static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd,
+                                         BLOCK_SIZE_TYPE bsize,
+                                         int plane, int block,
+                                         uint8_t *base, int stride) {
+  return base + raster_block_offset(xd, bsize, plane, block, stride);
+}
+
+static int txfrm_block_to_raster_block(MACROBLOCKD *xd,
+                                       BLOCK_SIZE_TYPE bsize,
+                                       int plane, int block,
+                                       int ss_txfrm_size) {
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int txwl = ss_txfrm_size / 2;
+  const int tx_cols_lg2 = bwl - txwl;
+  const int tx_cols = 1 << tx_cols_lg2;
+  const int raster_mb = block >> ss_txfrm_size;
+  const int x = (raster_mb & (tx_cols - 1)) << (txwl);
+  const int y = raster_mb >> tx_cols_lg2 << (txwl);
+  return x + (y << bwl);
+}
+
+static void txfrm_block_to_raster_xy(MACROBLOCKD *xd,
+                                     BLOCK_SIZE_TYPE bsize,
+                                     int plane, int block,
+                                     int ss_txfrm_size,
+                                     int *x, int *y) {
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int txwl = ss_txfrm_size / 2;
+  const int tx_cols_lg2 = bwl - txwl;
+  const int tx_cols = 1 << tx_cols_lg2;
+  const int raster_mb = block >> ss_txfrm_size;
+  *x = (raster_mb & (tx_cols - 1)) << (txwl);
+  *y = raster_mb >> tx_cols_lg2 << (txwl);
+}
+
+static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
+                             BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) {
+  const int bw = plane_block_width(bsize, &xd->plane[plane]);
+  const int bh = plane_block_height(bsize, &xd->plane[plane]);
+  int x, y;
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
+  x = x * 4 - 1;
+  y = y * 4 - 1;
+  // Copy a pixel into the umv if we are in a situation where the block size
+  // extends into the UMV.
+  // TODO(JBB): Should be able to do the full extend in place so we don't have
+  // to do this multiple times.
+  if (xd->mb_to_right_edge < 0) {
+    int umv_border_start = bw
+        + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x));
+
+    if (x + bw > umv_border_start)
+      vpx_memset(
+          xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
+              + umv_border_start,
+          *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride
+              + umv_border_start - 1),
+          bw);
+  }
+  if (xd->mb_to_bottom_edge < 0) {
+    int umv_border_start = bh
+        + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y));
+    int i;
+    uint8_t c = *(xd->plane[plane].dst.buf
+        + (umv_border_start - 1) * xd->plane[plane].dst.stride + x);
+
+    uint8_t *d = xd->plane[plane].dst.buf
+        + umv_border_start * xd->plane[plane].dst.stride + x;
+
+    if (y + bh > umv_border_start)
+      for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride)
+        *d = c;
+  }
+}
+static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                                   int plane, int ss_tx_size, int eob, int aoff,
+                                   int loff, ENTROPY_CONTEXT *A,
+                                   ENTROPY_CONTEXT *L) {
+  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
+  const int sw = bw - xd->plane[plane].subsampling_x;
+  const int sh = bh - xd->plane[plane].subsampling_y;
+  int mi_blocks_wide = 1 << sw;
+  int mi_blocks_high = 1 << sh;
+  int tx_size_in_blocks = (1 << ss_tx_size);
+  int above_contexts = tx_size_in_blocks;
+  int left_contexts = tx_size_in_blocks;
+  int pt;
+
+  // xd->mb_to_right_edge is in units of pixels * 8.  This converts
+  // it to 4x4 block sizes.
+  if (xd->mb_to_right_edge < 0) {
+    mi_blocks_wide += (xd->mb_to_right_edge
+        >> (5 + xd->plane[plane].subsampling_x));
+  }
+
+  // this code attempts to avoid copying into contexts that are outside
+  // our border.  Any blocks that do are set to 0...
+  if (above_contexts + aoff > mi_blocks_wide)
+    above_contexts = mi_blocks_wide - aoff;
+
+  if (xd->mb_to_bottom_edge < 0) {
+    mi_blocks_high += (xd->mb_to_bottom_edge
+        >> (5 + xd->plane[plane].subsampling_y));
+  }
+  if (left_contexts + loff > mi_blocks_high) {
+    left_contexts = mi_blocks_high - loff;
+  }
+
+  for (pt = 0; pt < above_contexts; pt++)
+    A[pt] = eob > 0;
+  for (pt = above_contexts; pt < (1 << ss_tx_size); pt++)
+    A[pt] = 0;
+  for (pt = 0; pt < left_contexts; pt++)
+    L[pt] = eob > 0;
+  for (pt = left_contexts; pt < (1 << ss_tx_size); pt++)
+    L[pt] = 0;
+}
+
+
 #endif  // VP9_COMMON_VP9_BLOCKD_H_
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -14,20 +14,8 @@
 /* Update probabilities for the nodes in the token entropy tree.
    Generated file included by vp9_entropy.c */
 
-static const vp9_prob vp9_coef_update_prob[ENTROPY_NODES] = {
-  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252
+static const vp9_prob vp9_coef_update_prob[UNCONSTRAINED_NODES] = {
+  252, 252, 252,
 };
-
-#if CONFIG_CODE_NONZEROCOUNT
-#define NZC_UPDATE_PROB_4X4     252
-#define NZC_UPDATE_PROB_8X8     252
-#define NZC_UPDATE_PROB_16X16   252
-#define NZC_UPDATE_PROB_32X32   252
-#define NZC_UPDATE_PROB_PCAT    252
-#endif
-
-#if CONFIG_MODELCOEFPROB
-#define COEF_MODEL_UPDATE_PROB   16
-#endif
 
 #endif  // VP9_COMMON_VP9_COEFUPDATEPROBS_H__
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -19,9 +19,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 
-#define TRUE    1
-#define FALSE   0
-
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 
@@ -54,5 +51,18 @@
 static INLINE int clamp(int value, int low, int high) {
   return value < low ? low : (value > high ? high : value);
 }
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int multiple8(int value) {
+  return (value + 7) & ~7;
+}
+
+#define SYNC_CODE_0 0x49
+#define SYNC_CODE_1 0x83
+#define SYNC_CODE_2 0x42
+
 
 #endif  // VP9_COMMON_VP9_COMMON_H_
--- a/vp9/common/vp9_context.c
+++ /dev/null
@@ -1,397 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_entropy.h"
-
-/* *** GENERATED FILE: DO NOT EDIT *** */
-
-#if 0
-int Contexts[vp8_coef_counter_dimen];
-
-const int default_contexts[vp8_coef_counter_dimen] = {
-  {
-    // Block Type ( 0 )
-    {
-      // Coeff Band ( 0 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-    {
-      // Coeff Band ( 1 )
-      {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593},
-      {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987},
-      {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104},
-    },
-    {
-      // Coeff Band ( 2 )
-      {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0},
-      {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294},
-      {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879},
-    },
-    {
-      // Coeff Band ( 3 )
-      {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0},
-      {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302},
-      { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611},
-    },
-    {
-      // Coeff Band ( 4 )
-      {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073},
-      { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50},
-    },
-    {
-      // Coeff Band ( 5 )
-      {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0},
-      {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362},
-      { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190},
-    },
-    {
-      // Coeff Band ( 6 )
-      {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0},
-      {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164},
-      { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8},
-    },
-  },
-  {
-    // Block Type ( 1 )
-    {
-      // Coeff Band ( 0 )
-      {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289},
-      {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914},
-      {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620},
-    },
-    {
-      // Coeff Band ( 1 )
-      {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0},
-      {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988},
-      {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136},
-    },
-    {
-      // Coeff Band ( 2 )
-      {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0},
-      {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980},
-      {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429},
-    },
-    {
-      // Coeff Band ( 3 )
-      {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0},
-      {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820},
-      {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679},
-    },
-    {
-      // Coeff Band ( 4 )
-      {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0},
-      {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127},
-      { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101},
-    },
-    {
-      // Coeff Band ( 5 )
-      {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0},
-      {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157},
-      { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198},
-    },
-    {
-      // Coeff Band ( 6 )
-      {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0},
-      {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195},
-      { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641},
-      {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-  },
-  {
-    // Block Type ( 2 )
-    {
-      // Coeff Band ( 0 )
-      { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798},
-      {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837},
-      {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122},
-    },
-    {
-      // Coeff Band ( 1 )
-      {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0},
-      {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063},
-      {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047},
-    },
-    {
-      // Coeff Band ( 2 )
-      { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0},
-      { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404},
-      { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236},
-    },
-    {
-      // Coeff Band ( 3 )
-      { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157},
-      { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300},
-    },
-    {
-      // Coeff Band ( 4 )
-      {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427},
-      {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7},
-    },
-    {
-      // Coeff Band ( 5 )
-      {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652},
-      {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30},
-    },
-    {
-      // Coeff Band ( 6 )
-      { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517},
-      {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-    },
-  },
-  {
-    // Block Type ( 3 )
-    {
-      // Coeff Band ( 0 )
-      {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694},
-      {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572},
-      {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284},
-    },
-    {
-      // Coeff Band ( 1 )
-      {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0},
-      {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280},
-      {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460},
-    },
-    {
-      // Coeff Band ( 2 )
-      {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0},
-      {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539},
-      {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138},
-    },
-    {
-      // Coeff Band ( 3 )
-      {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0},
-      {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181},
-      {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267},
-    },
-    {
-      // Coeff Band ( 4 )
-      {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0},
-      {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401},
-      {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268},
-    },
-    {
-      // Coeff Band ( 5 )
-      {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0},
-      {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811},
-      {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527},
-    },
-    {
-      // Coeff Band ( 6 )
-      {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0},
-      {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954},
-      {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979},
-    },
-    {
-      // Coeff Band ( 7 )
-      {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0},
-      {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459},
-      {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13},
-    },
-  },
-};
-
-// Update probabilities for the nodes in the token entropy tree.
-const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = {
-  {
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
-      {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-  {
-    {
-      {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-    {
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
-    },
-  },
-};
-#endif
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -122,78 +122,6 @@
   }
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-
-static inline uint8_t combine_qtr(uint8_t a, uint8_t b) {
-  return (((a) + (b) * 3 + 2) >> 2);
-}
-
-static inline uint8_t combine_3qtr(uint8_t a, uint8_t b) {
-  return (((a) * 3 + (b) + 2) >> 2);
-}
-
-static inline uint8_t combine_1by8(uint8_t a, uint8_t b) {
-  return (((a) * 1 + (b) * 7 + 4) >> 3);
-}
-
-static inline uint8_t combine_3by8(uint8_t a, uint8_t b) {
-  return (((a) * 3 + (b) * 5 + 4) >> 3);
-}
-
-static inline uint8_t combine_5by8(uint8_t a, uint8_t b) {
-  return (((a) * 5 + (b) * 3 + 4) >> 3);
-}
-
-static inline uint8_t combine_7by8(uint8_t a, uint8_t b) {
-  return (((a) * 7 + (b) * 1 + 4) >> 3);
-}
-
-// TODO(debargha): Implment with a separate weight parameter
-static void convolve_wtd_horiz_c(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
-                                 const int16_t *filter_x0, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4,
-                                 int w, int h, int taps,
-                                 uint8_t (*combine)(uint8_t a, uint8_t b)) {
-  int x, y, k, sum;
-  const int16_t *filter_x_base = filter_x0;
-
-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
-
-  /* Adjust base pointer address for this source line */
-  src -= taps / 2 - 1;
-
-  for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
-    /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
-
-    for (x = 0; x < w; ++x) {
-      /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
-
-      for (sum = 0, k = 0; k < taps; ++k) {
-        sum += src[src_x + k] * filter_x[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[x] = combine(dst[x], clip_pixel(sum >> VP9_FILTER_SHIFT));
-
-      /* Adjust source and filter to use for the next pixel */
-      x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-#endif
-
 static void convolve_vert_c(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
                             const int16_t *filter_x, int x_step_q4,
@@ -279,52 +207,6 @@
   }
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void convolve_wtd_vert_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y0, int y_step_q4,
-                                int w, int h, int taps,
-                                uint8_t (*combine)(uint8_t a, uint8_t b)) {
-  int x, y, k, sum;
-
-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
-
-  /* Adjust base pointer address for this source column */
-  src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;
-
-    /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
-
-    for (y = 0; y < h; ++y) {
-      /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
-
-      for (sum = 0, k = 0; k < taps; ++k) {
-        sum += src[(src_y + k) * src_stride] * filter_y[k];
-      }
-      sum += (VP9_FILTER_WEIGHT >> 1);
-      dst[y * dst_stride] = combine(dst[y * dst_stride],
-                                    clip_pixel(sum >> VP9_FILTER_SHIFT));
-
-      /* Adjust source and filter to use for the next pixel */
-      y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
-    }
-    ++src;
-    ++dst;
-  }
-}
-#endif
-
 static void convolve_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const int16_t *filter_x, int x_step_q4,
@@ -331,14 +213,14 @@
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h, int taps) {
   /* Fixed size intermediate buffer places limits on parameters.
-   * Maximum intermediate_height is 39, for y_step_q4 == 32,
-   * h == 16, taps == 8.
+   * Maximum intermediate_height is 135, for y_step_q4 == 32,
+   * h == 64, taps == 8.
    */
-  uint8_t temp[16 * 39];
+  uint8_t temp[64 * 135];
   int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
 
-  assert(w <= 16);
-  assert(h <= 16);
+  assert(w <= 64);
+  assert(h <= 64);
   assert(taps <= 8);
   assert(y_step_q4 <= 32);
 
@@ -346,10 +228,10 @@
     intermediate_height = h;
 
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 16,
+                   temp, 64,
                    filter_x, x_step_q4, filter_y, y_step_q4,
                    w, intermediate_height, taps);
-  convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
                   filter_x, x_step_q4, filter_y, y_step_q4,
                   w, h, taps);
 }
@@ -360,14 +242,14 @@
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h, int taps) {
   /* Fixed size intermediate buffer places limits on parameters.
-   * Maximum intermediate_height is 39, for y_step_q4 == 32,
-   * h == 16, taps == 8.
+   * Maximum intermediate_height is 135, for y_step_q4 == 32,
+   * h == 64, taps == 8.
    */
-  uint8_t temp[16 * 39];
+  uint8_t temp[64 * 135];
   int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;
 
-  assert(w <= 16);
-  assert(h <= 16);
+  assert(w <= 64);
+  assert(h <= 64);
   assert(taps <= 8);
   assert(y_step_q4 <= 32);
 
@@ -375,10 +257,10 @@
     intermediate_height = h;
 
   convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 16,
+                   temp, 64,
                    filter_x, x_step_q4, filter_y, y_step_q4,
                    w, intermediate_height, taps);
-  convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+  convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
                       filter_x, x_step_q4, filter_y, y_step_q4,
                       w, h, taps);
 }
@@ -403,68 +285,6 @@
                        w, h, 8);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_1by8);
-}
-
-void vp9_convolve8_qtr_horiz_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_qtr);
-}
-
-void vp9_convolve8_3by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_3by8);
-}
-
-void vp9_convolve8_5by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_5by8);
-}
-
-void vp9_convolve8_3qtr_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_3qtr);
-}
-
-void vp9_convolve8_7by8_horiz_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
-                                const int16_t *filter_x, int x_step_q4,
-                                const int16_t *filter_y, int y_step_q4,
-                                int w, int h) {
-  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,
-                       filter_x, x_step_q4, filter_y, y_step_q4,
-                       w, h, 8, combine_7by8);
-}
-#endif
-
 void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
                           uint8_t *dst, int dst_stride,
                           const int16_t *filter_x, int x_step_q4,
@@ -485,68 +305,6 @@
                       w, h, 8);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_1by8);
-}
-
-void vp9_convolve8_qtr_vert_c(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4,
-                              int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_qtr);
-}
-
-void vp9_convolve8_3by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_3by8);
-}
-
-void vp9_convolve8_5by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_5by8);
-}
-
-void vp9_convolve8_3qtr_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_3qtr);
-}
-
-void vp9_convolve8_7by8_vert_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4,
-                               int w, int h) {
-  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h, 8, combine_7by8);
-}
-#endif
-
 void vp9_convolve8_c(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride,
                      const int16_t *filter_x, int x_step_q4,
@@ -563,16 +321,16 @@
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
+  assert(w <= 64);
+  assert(h <= 64);
 
   vp9_convolve8(src, src_stride,
-                temp, 16,
+                temp, 64,
                 filter_x, x_step_q4,
                 filter_y, y_step_q4,
                 w, h);
-  vp9_convolve_avg(temp, 16,
+  vp9_convolve_avg(temp, 64,
                    dst, dst_stride,
                    NULL, 0, /* These unused parameter should be removed! */
                    NULL, 0, /* These unused parameter should be removed! */
@@ -579,140 +337,6 @@
                    w, h);
 }
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve8_1by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_1by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_qtr_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_qtr(temp, 16,
-                   dst, dst_stride,
-                   NULL, 0, /* These unused parameter should be removed! */
-                   NULL, 0, /* These unused parameter should be removed! */
-                   w, h);
-}
-
-void vp9_convolve8_3by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_3by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_5by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_5by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_3qtr_c(const uint8_t *src, int src_stride,
-                          uint8_t *dst, int dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4,
-                          int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_3qtr(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-
-void vp9_convolve8_7by8_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
-                         const int16_t *filter_x, int x_step_q4,
-                         const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);
-  assert(w <= 16);
-  assert(h <= 16);
-
-  vp9_convolve8(src, src_stride,
-                temp, 16,
-                filter_x, x_step_q4,
-                filter_y, y_step_q4,
-                w, h);
-  vp9_convolve_7by8(temp, 16,
-                    dst, dst_stride,
-                    NULL, 0, /* These unused parameter should be removed! */
-                    NULL, 0, /* These unused parameter should be removed! */
-                    w, h);
-}
-#endif
-
 void vp9_convolve_copy(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride,
                        const int16_t *filter_x, int filter_x_stride,
@@ -750,101 +374,3 @@
     dst += dst_stride;
   }
 }
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-void vp9_convolve_1by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_1by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_qtr(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int filter_x_stride,
-                      const int16_t *filter_y, int filter_y_stride,
-                      int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_qtr(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_3by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_3by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_5by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_5by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_3qtr(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_convolve_7by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      dst[x] = combine_7by8(dst[x], src[x]);
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-#endif
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -33,50 +33,6 @@
                       const int16_t *filter_y, int y_step_q4,
                       int w, int h);
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-// Not a convolution, a block wtd (1/8, 7/8) average for (dst, src)
-void vp9_convolve_1by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (1/4, 3/4) average for (dst, src)
-void vp9_convolve_qtr(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int x_step_q4,
-                      const int16_t *filter_y, int y_step_q4,
-                      int w, int h);
-
-// Not a convolution, a block wtd (3/8, 5/8) average for (dst, src)
-void vp9_convolve_3by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (5/8, 3/8) average for (dst, src)
-void vp9_convolve_5by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (3/4, 1/4) average for (dst, src)
-void vp9_convolve_3qtr(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block wtd (7/8, 1/8) average for (dst, src)
-void vp9_convolve_7by8(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-#endif
-
 struct subpix_fn_table {
   const int16_t (*filter_x)[8];
   const int16_t (*filter_y)[8];
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -13,130 +13,124 @@
 #include "vp9/common/vp9_blockd.h"
 
 void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
-                                        int frame) {
-  int mb_row;
-  int mb_col;
-  int mb_index = 0;
-  FILE *mvs = fopen("mvs.stt", "a");
+                                        int frame, char *file) {
+  int mi_row;
+  int mi_col;
+  int mi_index = 0;
+  FILE *mvs = fopen(file, "a");
 
   // Print out the macroblock Y modes
-  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+  fprintf(mvs, "SB Types for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type);
 
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+      mi_index++;
+    }
 
-      mb_index++;
+    fprintf(mvs, "\n");
+    mi_index += 8;
+  }
+
+  // Print out the macroblock Y modes
+  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+  mi_index = 0;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode);
+
+      mi_index++;
     }
 
     fprintf(mvs, "\n");
-    mb_index++;
+    mi_index += 8;
   }
 
   fprintf(mvs, "\n");
 
-  mb_index = 0;
+  mi_index = 0;
   fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]);
 
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
-
-      mb_index++;
+      mi_index++;
     }
 
     fprintf(mvs, "\n");
-    mb_index++;
+    mi_index += 8;
   }
-
   fprintf(mvs, "\n");
 
-  /* print out the macroblock UV modes */
-  mb_index = 0;
-  fprintf(mvs, "UV Modes for Frame %d\n", frame);
+  mi_index = 0;
+  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
+              mi[mi_index].mbmi.mv[0].as_mv.col);
 
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
-
-      mb_index++;
+      mi_index++;
     }
 
-    mb_index++;
     fprintf(mvs, "\n");
+    mi_index += 8;
   }
 
   fprintf(mvs, "\n");
 
-  /* print out the block modes */
-  mb_index = 0;
-  fprintf(mvs, "Mbs for Frame %d\n", frame);
-  {
-    int b_row;
+  /* print out the macroblock txform sizes */
+  mi_index = 0;
+  fprintf(mvs, "TXFM size for Frame %d\n", frame);
 
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size);
 
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-
-        if (mi[mb_index].mbmi.mode == B_PRED) {
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-        } else
-          fprintf(mvs, "xx ");
-
-      }
-
-      fprintf(mvs, "\n");
+      mi_index++;
     }
+
+    mi_index += 8;
+    fprintf(mvs, "\n");
   }
+
   fprintf(mvs, "\n");
 
-  /* print out the macroblock mvs */
-  mb_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
+  /* print out the macroblock UV modes */
+  mi_index = 0;
+  fprintf(mvs, "UV Modes for Frame %d\n", frame);
 
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
-          mi[mb_index].mbmi.mv[0].as_mv.col / 2);
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode);
 
-      mb_index++;
+      mi_index++;
     }
 
-    mb_index++;
+    mi_index += 8;
     fprintf(mvs, "\n");
   }
 
   fprintf(mvs, "\n");
 
-  /* print out the block modes */
-  mb_index = 0;
+  /* print out the macroblock mvs */
+  mi_index = 0;
   fprintf(mvs, "MVs for Frame %d\n", frame);
-  {
-    int b_row;
 
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2,
+              mi[mi_index].mbmi.mv[0].as_mv.col / 2);
 
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-        fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv[0].as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv[0].as_mv.col);
-
-      }
-
-      fprintf(mvs, "\n");
+      mi_index++;
     }
+
+    mi_index += 8;
+    fprintf(mvs, "\n");
   }
+
   fprintf(mvs, "\n");
 
   fclose(mvs);
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -11,987 +11,1374 @@
 
 /*Generated file, included by vp9_entropy.c*/
 
-// NOTE: When the CONFIG_MODELCOEFPROB experiment is on, only the first
-// 2 or 3 from each row is actually used depending on whether
-// UNCONSTRAINDED_NODES is 2 or 3. If this experiment is merged
-// the tables below should be shortened accordingly.
-static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES] = {
+#if CONFIG_BALANCED_COEFTREE
+static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 208,  32, 178, 198, 161, 167, 196, 147, 244, 194, 210 },
-        { 102,  43, 132, 185, 148, 162, 185, 141, 237, 181, 215 },
-        {  15,  36,  68, 143, 119, 151, 169, 133, 230, 173, 214 }
+        {   6, 213, 178 },
+        {  26, 113, 132 },
+        {  34,  17,  68 }
       }, { /* Coeff Band 1 */
-        {  71,  91, 178, 226, 169, 176, 232, 170, 252, 219, 231 },
-        {  72,  88, 174, 226, 168, 176, 232, 170, 252, 219, 234 },
-        {  40,  79, 154, 222, 161, 174, 231, 169, 251, 219, 238 },
-        {  21,  68, 126, 211, 144, 167, 230, 167, 252, 219, 236 },
-        {   7,  49,  84, 175, 121, 152, 223, 151, 251, 218, 237 },
-        {   1,  20,  32, 100,  97, 140, 163, 116, 237, 186, 222 }
+        {  66,  96, 178 },
+        {  63,  96, 174 },
+        {  67,  54, 154 },
+        {  62,  28, 126 },
+        {  48,   9,  84 },
+        {  20,   1,  32 }
       }, { /* Coeff Band 2 */
-        { 108, 110, 206, 237, 182, 183, 239, 181, 252, 221, 245 },
-        {  72,  98, 191, 236, 180, 182, 240, 183, 252, 223, 239 },
-        {  26,  77, 152, 230, 166, 179, 239, 181, 252, 222, 241 },
-        {   7,  57, 106, 212, 141, 167, 236, 173, 252, 223, 243 },
-        {   1,  35,  60, 171, 110, 149, 225, 155, 251, 218, 240 },
-        {   1,  14,  22,  90,  86, 134, 163, 116, 238, 181, 233 }
+        {  64, 144, 206 },
+        {  70,  99, 191 },
+        {  69,  36, 152 },
+        {  55,   9, 106 },
+        {  35,   1,  60 },
+        {  14,   1,  22 }
       }, { /* Coeff Band 3 */
-        { 105, 139, 222, 245, 196, 192, 245, 195, 253, 229, 255 },
-        {  76, 118, 205, 245, 192, 192, 247, 198, 254, 230, 255 },
-        {  21,  88, 164, 240, 175, 186, 246, 197, 255, 232, 255 },
-        {   5,  63, 118, 222, 149, 172, 242, 185, 255, 230, 254 },
-        {   1,  42,  74, 186, 120, 157, 227, 161, 253, 220, 250 },
-        {   1,  18,  30,  97,  92, 136, 163, 118, 244, 184, 244 }
+        {  82, 154, 222 },
+        {  83, 112, 205 },
+        {  81,  31, 164 },
+        {  62,   7, 118 },
+        {  42,   1,  74 },
+        {  18,   1,  30 }
       }, { /* Coeff Band 4 */
-        { 143, 117, 233, 251, 207, 201, 250, 210, 255, 239, 128 },
-        {  99, 104, 214, 249, 200, 199, 251, 211, 255, 238, 255 },
-        {  26,  81, 170, 245, 183, 192, 250, 206, 255, 242, 255 },
-        {   6,  60, 116, 226, 151, 176, 242, 187, 255, 235, 255 },
-        {   1,  38,  65, 178, 114, 153, 224, 157, 254, 224, 255 },
-        {   1,  15,  26,  86,  88, 133, 163, 110, 251, 197, 252 }
+        {  52, 179, 233 },
+        {  64, 132, 214 },
+        {  73,  36, 170 },
+        {  59,   8, 116 },
+        {  38,   1,  65 },
+        {  15,   1,  26 }
       }, { /* Coeff Band 5 */
-        { 155,  74, 238, 252, 215, 206, 252, 223, 255, 255, 128 },
-        { 152,  64, 223, 250, 205, 201, 254, 219, 255, 255, 128 },
-        {  67,  55, 182, 246, 187, 192, 251, 210, 255, 240, 128 },
-        {  27,  44, 127, 227, 155, 176, 244, 186, 255, 240, 255 },
-        {   9,  27,  69, 176, 115, 152, 227, 154, 255, 229, 255 },
-        {   2,  11,  28,  91,  84, 133, 177, 115, 254, 210, 255 }
+        {  29, 175, 238 },
+        {  26, 169, 223 },
+        {  41,  80, 182 },
+        {  39,  32, 127 },
+        {  26,  10,  69 },
+        {  11,   2,  28 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 207, 112, 234, 244, 192, 193, 246, 194, 255, 237, 255 },
-        { 145, 120, 212, 233, 178, 183, 232, 177, 252, 216, 228 },
-        {  77, 114, 177, 214, 164, 174, 210, 159, 245, 199, 230 }
+        {  21, 226, 234 },
+        {  52, 182, 212 },
+        {  80, 112, 177 }
       }, { /* Coeff Band 1 */
-        {  93, 174, 243, 248, 205, 200, 245, 195, 255, 232, 255 },
-        { 100, 144, 231, 248, 204, 200, 244, 193, 255, 232, 255 },
-        {  28, 101, 186, 247, 194, 199, 244, 194, 255, 232, 255 },
-        {   9,  73, 132, 238, 155, 186, 245, 197, 255, 232, 250 },
-        {   2,  44,  76, 187, 112, 151, 240, 172, 255, 235, 249 },
-        {   1,  19,  33,  98,  92, 138, 176, 113, 252, 208, 249 }
+        { 111, 164, 243 },
+        {  88, 152, 231 },
+        {  90,  43, 186 },
+        {  70,  12, 132 },
+        {  44,   2,  76 },
+        {  19,   1,  33 }
       }, { /* Coeff Band 2 */
-        { 116, 175, 246, 250, 212, 202, 248, 198, 255, 238, 255 },
-        {  78, 142, 231, 250, 208, 203, 249, 200, 255, 241, 255 },
-        {  14,  93, 177, 245, 186, 196, 248, 198, 255, 241, 255 },
-        {   4,  65, 122, 227, 148, 177, 244, 186, 255, 241, 243 },
-        {   1,  38,  69, 180, 111, 152, 235, 162, 255, 237, 247 },
-        {   1,  18,  30, 101,  89, 133, 190, 116, 255, 219, 246 }
+        {  96, 185, 246 },
+        {  99, 127, 231 },
+        {  88,  21, 177 },
+        {  64,   5, 122 },
+        {  38,   1,  69 },
+        {  18,   1,  30 }
       }, { /* Coeff Band 3 */
-        { 138, 183, 249, 253, 220, 209, 252, 210, 255, 251, 128 },
-        {  93, 147, 237, 252, 213, 209, 253, 213, 255, 251, 128 },
-        {  21, 104, 187, 247, 185, 196, 252, 210, 255, 249, 128 },
-        {   6,  73, 131, 225, 147, 174, 248, 190, 255, 248, 128 },
-        {   1,  47,  83, 189, 119, 155, 239, 167, 255, 246, 128 },
-        {   1,  26,  44, 130,  96, 139, 209, 129, 255, 235, 255 }
+        {  84, 206, 249 },
+        {  94, 147, 237 },
+        {  95,  33, 187 },
+        {  71,   8, 131 },
+        {  47,   1,  83 },
+        {  26,   1,  44 }
       }, { /* Coeff Band 4 */
-        { 188, 143, 252, 255, 228, 218, 253, 218, 255, 209, 128 },
-        { 137, 124, 241, 253, 215, 211, 254, 221, 255, 255, 128 },
-        {  32,  89, 188, 248, 186, 198, 254, 216, 255, 253, 128 },
-        {   7,  61, 122, 231, 146, 176, 252, 201, 255, 250, 128 },
-        {   1,  34,  66, 186, 103, 149, 246, 176, 255, 249, 128 },
-        {   1,  18,  34, 115,  91, 134, 217, 124, 255, 233, 255 }
+        {  38, 221, 252 },
+        {  58, 177, 241 },
+        {  78,  46, 188 },
+        {  59,   9, 122 },
+        {  34,   1,  66 },
+        {  18,   1,  34 }
       }, { /* Coeff Band 5 */
-        { 198,  92, 253, 255, 231, 222, 255, 230, 128, 128, 128 },
-        { 189,  79, 244, 254, 220, 217, 255, 237, 255, 255, 128 },
-        {  78,  61, 200, 252, 196, 207, 255, 231, 255, 255, 128 },
-        {  34,  50, 146, 242, 161, 187, 255, 222, 255, 255, 128 },
-        {  11,  38,  93, 215, 122, 159, 253, 202, 255, 255, 128 },
-        {   1,  31,  55, 143, 102, 143, 227, 148, 255, 238, 128 }
+        {  21, 216, 253 },
+        {  21, 206, 244 },
+        {  42,  93, 200 },
+        {  43,  41, 146 },
+        {  36,  13,  93 },
+        {  31,   1,  55 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 207,  35, 219, 243, 195, 192, 243, 188, 251, 232, 238 },
-        { 126,  46, 182, 230, 177, 182, 228, 171, 248, 214, 232 },
-        {  51,  47, 125, 196, 147, 166, 206, 151, 245, 199, 229 }
+        {   7, 213, 219 },
+        {  23, 139, 182 },
+        {  38,  60, 125 }
       }, { /* Coeff Band 1 */
-        { 114, 124, 220, 244, 197, 192, 242, 189, 253, 226, 255 },
-        { 142, 116, 213, 243, 194, 191, 241, 188, 252, 226, 255 },
-        {  81, 101, 190, 242, 188, 190, 242, 190, 253, 229, 255 },
-        {  42,  83, 155, 235, 166, 183, 241, 190, 253, 227, 246 },
-        {  16,  62, 104, 205, 133, 161, 238, 176, 254, 227, 250 },
-        {   6,  40,  60, 132, 109, 145, 190, 128, 248, 202, 239 }
+        {  69, 156, 220 },
+        {  52, 178, 213 },
+        {  69, 111, 190 },
+        {  69,  58, 155 },
+        {  58,  21, 104 },
+        {  39,   7,  60 }
       }, { /* Coeff Band 2 */
-        { 139, 149, 228, 248, 205, 198, 244, 196, 255, 223, 255 },
-        { 115, 127, 221, 248, 202, 198, 245, 198, 255, 228, 255 },
-        {  43, 100, 189, 246, 195, 195, 244, 196, 254, 234, 228 },
-        {  13,  77, 141, 238, 168, 187, 243, 191, 255, 232, 255 },
-        {   3,  49,  88, 203, 125, 160, 237, 178, 253, 227, 251 },
-        {   1,  23,  41, 118,  97, 136, 191, 127, 250, 207, 247 }
+        {  68, 189, 228 },
+        {  70, 158, 221 },
+        {  83,  64, 189 },
+        {  73,  18, 141 },
+        {  48,   4,  88 },
+        {  23,   1,  41 }
       }, { /* Coeff Band 3 */
-        { 119, 185, 236, 251, 216, 205, 249, 202, 253, 237, 255 },
-        {  89, 140, 224, 251, 211, 205, 250, 208, 255, 241, 255 },
-        {  34, 105, 189, 248, 195, 197, 250, 208, 255, 245, 255 },
-        {  14,  78, 142, 235, 166, 182, 246, 194, 255, 242, 255 },
-        {   5,  49,  90, 196, 128, 160, 235, 165, 255, 237, 255 },
-        {   1,  22,  41, 114,  97, 139, 180, 124, 252, 201, 249 }
+        {  99, 194, 236 },
+        {  91, 138, 224 },
+        {  91,  53, 189 },
+        {  74,  20, 142 },
+        {  48,   6,  90 },
+        {  22,   1,  41 }
       }, { /* Coeff Band 4 */
-        { 162, 142, 244, 254, 228, 215, 255, 230, 128, 128, 128 },
-        { 129, 120, 231, 253, 216, 210, 255, 228, 255, 255, 128 },
-        {  44,  90, 189, 249, 195, 199, 253, 217, 255, 240, 128 },
-        {  14,  65, 132, 234, 158, 181, 249, 203, 255, 248, 128 },
-        {   3,  38,  72, 188, 112, 154, 239, 171, 255, 243, 128 },
-        {   1,  17,  39, 110,  86, 141, 201, 123, 255, 240, 128 }
+        {  52, 203, 244 },
+        {  60, 168, 231 },
+        {  75,  62, 189 },
+        {  61,  18, 132 },
+        {  38,   4,  72 },
+        {  17,   1,  39 }
       }, { /* Coeff Band 5 */
-        { 167,  96, 247, 255, 230, 218, 249, 231, 255, 255, 128 },
-        { 163,  84, 234, 253, 214, 209, 255, 231, 255, 255, 128 },
-        {  70,  63, 185, 249, 189, 197, 255, 230, 255, 255, 128 },
-        {  30,  44, 132, 238, 157, 180, 251, 210, 255, 220, 128 },
-        {  13,  30,  80, 195, 121, 153, 243, 179, 255, 224, 128 },
-        {   5,  13,  38, 103, 109, 128, 196, 147, 255, 255, 128 }
+        {  33, 192, 247 },
+        {  31, 185, 234 },
+        {  46,  85, 185 },
+        {  39,  35, 132 },
+        {  28,  15,  80 },
+        {  13,   5,  38 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 242,  90, 246, 244, 200, 192, 242, 189, 255, 234, 255 },
-        { 186, 102, 228, 233, 187, 182, 231, 172, 254, 225, 252 },
-        { 102, 108, 203, 228, 181, 180, 218, 167, 243, 201, 223 }
+        {   5, 247, 246 },
+        {  28, 209, 228 },
+        {  65, 137, 203 }
       }, { /* Coeff Band 1 */
-        { 152, 169, 250, 253, 223, 209, 251, 208, 255, 250, 128 },
-        { 164, 149, 242, 253, 222, 209, 249, 207, 253, 238, 255 },
-        {  63, 108, 204, 252, 215, 211, 251, 211, 255, 242, 128 },
-        {  39,  83, 153, 248, 175, 199, 250, 214, 255, 245, 128 },
-        {  31,  66, 108, 214, 130, 161, 251, 196, 255, 237, 128 },
-        {  27,  65,  71, 150, 112, 149, 213, 133, 255, 230, 255 }
+        {  69, 208, 250 },
+        {  54, 207, 242 },
+        {  81,  92, 204 },
+        {  70,  54, 153 },
+        {  58,  40, 108 },
+        {  58,  35,  71 }
       }, { /* Coeff Band 2 */
-        { 161, 174, 250, 254, 226, 215, 254, 226, 255, 230, 128 },
-        { 133, 150, 239, 254, 222, 213, 254, 225, 255, 255, 128 },
-        {  32, 105, 197, 252, 206, 207, 253, 220, 255, 255, 128 },
-        {  10,  78, 147, 245, 173, 193, 253, 212, 255, 255, 128 },
-        {   2,  49,  99, 221, 133, 164, 250, 198, 255, 252, 128 },
-        {   1,  26,  53, 154,  96, 135, 234, 142, 255, 240, 128 }
+        {  65, 215, 250 },
+        {  72, 185, 239 },
+        {  92,  50, 197 },
+        {  75,  14, 147 },
+        {  49,   2,  99 },
+        {  26,   1,  53 }
       }, { /* Coeff Band 3 */
-        { 160, 187, 251, 255, 234, 223, 255, 233, 128, 128, 128 },
-        { 131, 155, 241, 255, 228, 222, 255, 232, 255, 255, 128 },
-        {  42, 108, 198, 253, 207, 212, 255, 234, 255, 255, 128 },
-        {  18,  81, 151, 246, 176, 194, 254, 222, 255, 255, 128 },
-        {   9,  60, 112, 225, 144, 167, 252, 199, 255, 255, 128 },
-        {   5,  35,  49, 163, 113, 150, 237, 118, 255, 255, 128 }
+        {  70, 220, 251 },
+        {  76, 186, 241 },
+        {  90,  65, 198 },
+        {  75,  26, 151 },
+        {  58,  12, 112 },
+        {  34,   6,  49 }
       }, { /* Coeff Band 4 */
-        { 195, 141, 253, 255, 242, 232, 255, 255, 128, 128, 128 },
-        { 169, 128, 245, 255, 235, 227, 255, 248, 128, 128, 128 },
-        {  62,  91, 204, 255, 216, 220, 255, 233, 128, 128, 128 },
-        {  23,  70, 150, 248, 178, 202, 255, 223, 128, 128, 128 },
-        {   2,  44,  78, 220, 110, 164, 255, 209, 128, 128, 128 },
-        {   1,   1, 128, 255, 255, 128, 128, 128, 128, 128, 128 }
+        {  34, 224, 253 },
+        {  44, 204, 245 },
+        {  69,  85, 204 },
+        {  64,  31, 150 },
+        {  44,   2,  78 },
+        {   1,   1, 128 }
       }, { /* Coeff Band 5 */
-        { 195, 104, 253, 255, 246, 246, 255, 171, 128, 128, 128 },
-        { 197,  92, 248, 255, 239, 228, 255, 239, 128, 128, 128 },
-        {  88,  71, 214, 255, 219, 220, 255, 244, 128, 128, 128 },
-        {  39,  56, 160, 250, 187, 204, 255, 255, 128, 128, 128 },
-        {  18,  28,  90, 217,  81, 137, 255, 128, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        {  25, 216, 253 },
+        {  21, 215, 248 },
+        {  47, 108, 214 },
+        {  47,  48, 160 },
+        {  26,  20,  90 },
+        {  64, 171, 128 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES] = {
+static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 196,  40, 199, 180, 158, 161, 172, 135, 226, 183, 140 },
-        {  83,  38, 128, 153, 142, 157, 155, 128, 222, 164, 202 },
-        {  10,  29,  55, 116, 113, 146, 150, 122, 223, 169, 200 }
+        {   9, 203, 199 },
+        {  26,  92, 128 },
+        {  28,  11,  55 }
       }, { /* Coeff Band 1 */
-        {  33, 114, 160, 211, 155, 169, 223, 162, 248, 212, 215 },
-        {  69, 107, 155, 210, 154, 169, 224, 163, 248, 212, 216 },
-        {  30,  91, 138, 207, 150, 168, 223, 162, 248, 212, 216 },
-        {  12,  74, 115, 200, 140, 164, 222, 160, 249, 212, 219 },
-        {   4,  52,  80, 172, 121, 153, 216, 149, 249, 212, 226 },
-        {   1,  27,  40, 105, 101, 141, 157, 120, 231, 177, 210 }
+        {  99,  54, 160 },
+        {  78,  99, 155 },
+        {  80,  44, 138 },
+        {  71,  17, 115 },
+        {  51,   5,  80 },
+        {  27,   1,  40 }
       }, { /* Coeff Band 2 */
-        {  38, 159, 190, 227, 171, 177, 229, 172, 250, 214, 237 },
-        {  34, 130, 182, 229, 173, 180, 231, 174, 249, 215, 234 },
-        {  10,  97, 153, 226, 164, 178, 232, 175, 250, 215, 241 },
-        {   3,  71, 115, 213, 145, 170, 230, 171, 251, 217, 235 },
-        {   1,  41,  68, 172, 114, 152, 219, 154, 250, 212, 235 },
-        {   1,  16,  27,  88,  90, 135, 155, 113, 235, 180, 216 }
+        { 135,  81, 190 },
+        { 113,  61, 182 },
+        {  93,  16, 153 },
+        {  70,   4, 115 },
+        {  41,   1,  68 },
+        {  16,   1,  27 }
       }, { /* Coeff Band 3 */
-        {  41, 184, 214, 238, 187, 186, 235, 180, 252, 217, 236 },
-        {  24, 142, 199, 241, 188, 189, 237, 184, 252, 220, 235 },
-        {   6,  97, 159, 235, 172, 184, 239, 185, 252, 221, 243 },
-        {   1,  63, 110, 214, 144, 170, 234, 174, 253, 223, 243 },
-        {   1,  32,  58, 166, 109, 149, 218, 152, 251, 215, 238 },
-        {   1,  12,  21,  78,  85, 131, 152, 109, 236, 180, 224 }
+        { 155, 103, 214 },
+        { 129,  48, 199 },
+        {  95,  10, 159 },
+        {  63,   1, 110 },
+        {  32,   1,  58 },
+        {  12,   1,  21 }
       }, { /* Coeff Band 4 */
-        {  54, 207, 231, 245, 201, 193, 238, 186, 252, 221, 220 },
-        {  32, 156, 213, 246, 198, 195, 242, 192, 252, 224, 245 },
-        {   7,  98, 164, 240, 177, 187, 243, 193, 252, 227, 244 },
-        {   2,  62, 108, 216, 143, 170, 237, 177, 254, 227, 248 },
-        {   1,  32,  57, 165, 108, 148, 219, 152, 252, 217, 243 },
-        {   1,  13,  22,  79,  87, 132, 153, 109, 240, 182, 232 }
+        { 163, 149, 231 },
+        { 137,  69, 213 },
+        {  95,  11, 164 },
+        {  62,   3, 108 },
+        {  32,   1,  57 },
+        {  13,   1,  22 }
       }, { /* Coeff Band 5 */
-        {  89, 208, 239, 250, 216, 200, 240, 190, 255, 222, 219 },
-        {  53, 155, 223, 250, 209, 202, 245, 199, 253, 225, 246 },
-        {  12, 102, 170, 243, 183, 192, 246, 198, 254, 230, 255 },
-        {   3,  67, 111, 218, 144, 171, 239, 180, 254, 231, 248 },
-        {   1,  38,  60, 164, 108, 148, 221, 152, 253, 220, 246 },
-        {   1,  18,  26,  81,  88, 132, 157, 108, 245, 188, 241 }
+        { 136, 189, 239 },
+        { 123, 102, 223 },
+        {  97,  19, 170 },
+        {  66,   4, 111 },
+        {  38,   1,  60 },
+        {  18,   1,  26 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 205, 121, 244, 237, 187, 188, 229, 174, 248, 215, 228 },
-        { 140, 120, 211, 219, 174, 177, 207, 158, 241, 195, 214 },
-        {  51, 100, 152, 198, 155, 168, 199, 148, 240, 193, 207 }
+        {  24, 226, 244 },
+        {  54, 178, 211 },
+        {  80,  74, 152 }
       }, { /* Coeff Band 1 */
-        {  66, 196, 236, 247, 202, 197, 243, 193, 254, 228, 246 },
-        {  99, 164, 223, 246, 199, 196, 243, 193, 254, 226, 255 },
-        {  29, 122, 187, 244, 187, 194, 244, 193, 255, 227, 239 },
-        {  14,  95, 145, 234, 156, 181, 244, 194, 254, 229, 246 },
-        {   6,  68,  97, 190, 123, 155, 240, 168, 254, 232, 245 },
-        {   3,  43,  50, 112, 105, 143, 170, 118, 245, 195, 230 }
+        { 145, 153, 236 },
+        { 101, 163, 223 },
+        { 108,  50, 187 },
+        {  90,  22, 145 },
+        {  66,   8,  97 },
+        {  42,   4,  50 }
       }, { /* Coeff Band 2 */
-        {  66, 202, 238, 248, 206, 199, 245, 196, 254, 233, 244 },
-        {  45, 155, 218, 248, 200, 199, 245, 197, 254, 229, 208 },
-        {   6,  96, 163, 242, 178, 191, 245, 196, 254, 233, 228 },
-        {   2,  64, 110, 224, 142, 175, 242, 185, 254, 232, 247 },
-        {   1,  34,  61, 172, 103, 147, 232, 164, 254, 226, 244 },
-        {   1,  13,  24,  82,  85, 133, 165, 105, 248, 199, 242 }
+        { 150, 159, 238 },
+        { 128,  90, 218 },
+        {  94,   9, 163 },
+        {  64,   3, 110 },
+        {  34,   1,  61 },
+        {  13,   1,  24 }
       }, { /* Coeff Band 3 */
-        {  66, 204, 242, 251, 213, 204, 248, 204, 255, 236, 255 },
-        {  38, 158, 222, 251, 206, 205, 249, 206, 255, 238, 255 },
-        {   6,  95, 166, 244, 178, 194, 249, 205, 255, 236, 255 },
-        {   2,  61, 111, 223, 141, 173, 244, 187, 255, 237, 255 },
-        {   1,  31,  59, 171, 104, 149, 230, 158, 255, 230, 252 },
-        {   1,  12,  22,  82,  79, 128, 171, 111, 251, 203, 249 }
+        { 151, 162, 242 },
+        { 135,  80, 222 },
+        {  93,   9, 166 },
+        {  61,   3, 111 },
+        {  31,   1,  59 },
+        {  12,   1,  22 }
       }, { /* Coeff Band 4 */
-        {  63, 214, 245, 252, 219, 208, 249, 206, 255, 241, 128 },
-        {  38, 164, 228, 252, 210, 208, 251, 212, 255, 245, 255 },
-        {   5, 101, 174, 246, 182, 196, 251, 207, 255, 244, 255 },
-        {   1,  64, 116, 224, 142, 174, 246, 190, 255, 241, 228 },
-        {   1,  34,  63, 172, 105, 148, 233, 160, 255, 235, 237 },
-        {   1,  14,  26,  88,  85, 130, 177, 110, 252, 210, 250 }
+        { 161, 170, 245 },
+        { 140,  84, 228 },
+        {  99,   8, 174 },
+        {  64,   1, 116 },
+        {  34,   1,  63 },
+        {  14,   1,  26 }
       }, { /* Coeff Band 5 */
-        {  91, 214, 246, 254, 226, 213, 251, 210, 255, 239, 255 },
-        {  55, 162, 233, 253, 215, 210, 253, 216, 255, 244, 128 },
-        {  10, 104, 179, 247, 184, 196, 252, 212, 255, 247, 255 },
-        {   2,  67, 119, 226, 143, 173, 249, 195, 255, 245, 255 },
-        {   1,  37,  66, 175, 106, 149, 237, 164, 255, 240, 255 },
-        {   1,  16,  30,  96,  87, 132, 188, 113, 255, 222, 255 }
+        { 138, 197, 246 },
+        { 127, 109, 233 },
+        { 100,  16, 179 },
+        {  66,   3, 119 },
+        {  37,   1,  66 },
+        {  16,   1,  30 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 211,  32, 212, 235, 185, 184, 223, 167, 239, 210, 182 },
-        { 121,  47, 171, 224, 171, 180, 211, 162, 238, 195, 221 },
-        {  40,  51, 118, 203, 145, 168, 211, 160, 246, 200, 236 }
+        {   6, 216, 212 },
+        {  25, 134, 171 },
+        {  43,  48, 118 }
       }, { /* Coeff Band 1 */
-        {  71, 129, 209, 244, 192, 194, 242, 188, 255, 230, 255 },
-        { 118, 122, 206, 244, 192, 192, 241, 187, 254, 227, 255 },
-        {  53, 104, 184, 241, 186, 190, 241, 184, 254, 232, 255 },
-        {  20,  81, 148, 234, 168, 183, 240, 183, 254, 231, 240 },
-        {   3,  47,  82, 197, 127, 160, 234, 166, 254, 228, 251 },
-        {   1,  18,  28,  96,  88, 134, 174, 116, 247, 194, 247 }
+        {  93, 112, 209 },
+        {  66, 159, 206 },
+        {  82,  78, 184 },
+        {  75,  28, 148 },
+        {  46,   4,  82 },
+        {  18,   1,  28 }
       }, { /* Coeff Band 2 */
-        {  86, 162, 220, 247, 203, 198, 245, 193, 255, 237, 255 },
-        {  84, 134, 216, 247, 201, 197, 244, 192, 255, 233, 255 },
-        {  26, 102, 186, 243, 190, 192, 244, 192, 255, 232, 255 },
-        {   7,  75, 135, 231, 163, 181, 240, 183, 255, 234, 255 },
-        {   1,  46,  79, 193, 121, 157, 233, 168, 255, 225, 242 },
-        {   1,  20,  35, 113,  94, 136, 191, 123, 252, 209, 250 }
+        { 108, 148, 220 },
+        {  90, 130, 216 },
+        {  92,  40, 186 },
+        {  73,  10, 135 },
+        {  46,   1,  79 },
+        {  20,   1,  35 }
       }, { /* Coeff Band 3 */
-        {  89, 191, 232, 250, 211, 203, 248, 202, 255, 230, 128 },
-        {  67, 148, 223, 250, 207, 201, 250, 207, 255, 247, 255 },
-        {  19, 105, 183, 245, 189, 193, 249, 202, 255, 244, 255 },
-        {   5,  72, 127, 228, 156, 177, 245, 186, 255, 238, 255 },
-        {   1,  44,  76, 190, 119, 156, 234, 167, 255, 231, 255 },
-        {   1,  21,  36, 116,  92, 138, 195, 128, 250, 208, 241 }
+        { 125, 173, 232 },
+        { 109, 117, 223 },
+        {  97,  31, 183 },
+        {  71,   7, 127 },
+        {  44,   1,  76 },
+        {  21,   1,  36 }
       }, { /* Coeff Band 4 */
-        {  94, 210, 236, 252, 215, 206, 253, 209, 255, 247, 128 },
-        {  68, 153, 224, 251, 209, 204, 251, 213, 255, 240, 128 },
-        {  14, 103, 178, 246, 188, 195, 251, 209, 255, 239, 128 },
-        {   2,  70, 122, 230, 154, 177, 247, 194, 255, 239, 128 },
-        {   1,  42,  72, 189, 115, 153, 234, 166, 255, 229, 255 },
-        {   1,  19,  34, 104,  98, 143, 180, 124, 252, 200, 255 }
+        { 133, 195, 236 },
+        { 112, 121, 224 },
+        {  97,  23, 178 },
+        {  69,   3, 122 },
+        {  42,   1,  72 },
+        {  19,   1,  34 }
       }, { /* Coeff Band 5 */
-        {  87, 200, 238, 254, 226, 214, 250, 212, 255, 226, 128 },
-        {  55, 151, 225, 253, 217, 212, 253, 217, 255, 233, 128 },
-        {  11, 106, 179, 249, 193, 200, 252, 213, 255, 247, 128 },
-        {   2,  72, 124, 232, 155, 180, 246, 195, 255, 230, 128 },
-        {   1,  42,  70, 182, 114, 153, 232, 163, 255, 236, 255 },
-        {   1,  17,  28,  95,  92, 137, 170, 115, 252, 208, 228 }
+        { 132, 180, 238 },
+        { 119, 102, 225 },
+        { 101,  18, 179 },
+        {  71,   3, 124 },
+        {  42,   1,  70 },
+        {  17,   1,  28 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 238,  66, 250, 245, 205, 193, 232, 180, 254, 228, 255 },
-        { 178,  84, 226, 237, 192, 185, 230, 176, 253, 217, 251 },
-        {  76,  83, 168, 218, 166, 173, 225, 162, 252, 220, 243 }
+        {   5, 242, 250 },
+        {  26, 198, 226 },
+        {  58,  98, 168 }
       }, { /* Coeff Band 1 */
-        { 137, 176, 246, 252, 218, 207, 251, 208, 255, 238, 128 },
-        { 176, 160, 237, 252, 217, 206, 249, 209, 255, 247, 128 },
-        {  68, 128, 205, 251, 209, 207, 251, 207, 255, 248, 128 },
-        {  40, 105, 167, 246, 172, 192, 252, 215, 255, 247, 128 },
-        {  22,  84, 131, 214, 144, 164, 249, 185, 255, 250, 255 },
-        {  11,  60,  91, 161, 130, 155, 194, 133, 253, 214, 255 }
+        {  82, 201, 246 },
+        {  50, 219, 237 },
+        {  94, 107, 205 },
+        {  89,  61, 167 },
+        {  77,  31, 131 },
+        {  57,  14,  91 }
       }, { /* Coeff Band 2 */
-        { 124, 192, 247, 253, 223, 210, 254, 215, 255, 255, 128 },
-        { 103, 161, 234, 253, 218, 209, 253, 214, 255, 255, 128 },
-        {  19, 108, 190, 250, 202, 202, 251, 213, 255, 241, 128 },
-        {   6,  74, 131, 242, 165, 191, 251, 207, 255, 244, 128 },
-        {   1,  41,  72, 198, 111, 151, 249, 185, 255, 248, 128 },
-        {   1,  14,  24,  82,  90, 140, 185,  96, 254, 224, 255 }
+        {  99, 202, 247 },
+        {  96, 165, 234 },
+        { 100,  31, 190 },
+        {  72,   8, 131 },
+        {  41,   1,  72 },
+        {  14,   1,  24 }
       }, { /* Coeff Band 3 */
-        { 118, 200, 248, 254, 228, 216, 254, 222, 255, 213, 128 },
-        {  91, 166, 235, 254, 220, 212, 254, 223, 255, 233, 128 },
-        {  16, 110, 186, 251, 197, 201, 255, 225, 255, 255, 128 },
-        {   3,  72, 124, 239, 160, 186, 253, 209, 255, 239, 128 },
-        {   1,  39,  66, 198, 106, 151, 248, 191, 255, 247, 128 },
-        {   1,  14,  19,  94,  74, 124, 209, 109, 255, 245, 128 }
+        { 108, 204, 248 },
+        { 107, 156, 235 },
+        { 103,  27, 186 },
+        {  71,   4, 124 },
+        {  39,   1,  66 },
+        {  14,   1,  19 }
       }, { /* Coeff Band 4 */
-        { 112, 213, 248, 255, 231, 218, 255, 234, 255, 255, 128 },
-        {  80, 172, 234, 254, 220, 216, 255, 233, 255, 255, 128 },
-        {  11, 112, 182, 251, 195, 204, 255, 231, 255, 224, 128 },
-        {   2,  73, 126, 241, 159, 186, 254, 219, 255, 255, 128 },
-        {   1,  40,  69, 207, 111, 159, 249, 191, 255, 255, 128 },
-        {   1,  16,  24,  83,  78, 138, 230, 134, 255, 239, 128 }
+        { 120, 211, 248 },
+        { 118, 149, 234 },
+        { 107,  19, 182 },
+        {  72,   3, 126 },
+        {  40,   1,  69 },
+        {  16,   1,  24 }
       }, { /* Coeff Band 5 */
-        { 100, 209, 245, 255, 236, 225, 248, 231, 255, 192, 128 },
-        {  65, 164, 232, 255, 226, 221, 255, 240, 255, 255, 128 },
-        {  11, 117, 186, 253, 203, 209, 255, 240, 255, 255, 128 },
-        {   2,  83, 136, 245, 167, 191, 253, 222, 255, 255, 128 },
-        {   1,  55,  88, 213, 122, 157, 248, 182, 255, 255, 128 },
-        {   1,  10,  38,  58,  85,  43, 198, 107, 255, 255, 128 }
+        { 127, 199, 245 },
+        { 122, 125, 232 },
+        { 112,  20, 186 },
+        {  82,   3, 136 },
+        {  55,   1,  88 },
+        {  10,   1,  38 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES] = {
+static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        {   8,  26, 101, 170, 141, 159, 166, 138, 205, 164, 158 },
-        {   2,  25,  67, 119, 124, 152, 121, 123, 189, 145, 175 },
-        {   1,  15,  28,  67, 102, 139,  95, 107, 191, 136, 187 }
+        {  25,   9, 101 },
+        {  25,   2,  67 },
+        {  15,   1,  28 }
       }, { /* Coeff Band 1 */
-        {  22,  73, 118, 160, 137, 157, 175, 132, 242, 184, 229 },
-        {  43,  73, 116, 160, 137, 157, 177, 132, 242, 185, 231 },
-        {  24,  66, 105, 158, 134, 156, 175, 133, 242, 185, 232 },
-        {   9,  54,  85, 150, 126, 153, 175, 132, 242, 185, 231 },
-        {   2,  34,  54, 123, 109, 145, 168, 124, 242, 183, 231 },
-        {   1,  14,  22,  63,  93, 134, 108, 103, 214, 149, 206 }
+        {  67,  30, 118 },
+        {  61,  56, 116 },
+        {  60,  31, 105 },
+        {  52,  11,  85 },
+        {  34,   2,  54 },
+        {  14,   1,  22 }
       }, { /* Coeff Band 2 */
-        {  34, 123, 149, 186, 148, 163, 195, 143, 245, 195, 233 },
-        {  34, 106, 147, 189, 149, 164, 198, 146, 246, 197, 234 },
-        {  10,  81, 123, 186, 143, 162, 200, 147, 246, 198, 235 },
-        {   2,  56,  87, 170, 127, 156, 201, 143, 248, 202, 234 },
-        {   1,  35,  56, 138, 109, 146, 187, 133, 246, 196, 233 },
-        {   1,  17,  27,  80,  93, 135, 136, 109, 229, 168, 215 }
+        { 107,  58, 149 },
+        {  92,  53, 147 },
+        {  78,  14, 123 },
+        {  56,   3,  87 },
+        {  35,   1,  56 },
+        {  17,   1,  27 }
       }, { /* Coeff Band 3 */
-        {  27, 159, 171, 208, 161, 171, 211, 155, 249, 205, 239 },
-        {  17, 119, 162, 213, 160, 172, 218, 160, 250, 210, 238 },
-        {   3,  81, 128, 207, 149, 168, 220, 161, 250, 213, 238 },
-        {   1,  53,  87, 183, 128, 158, 217, 153, 251, 214, 239 },
-        {   1,  31,  52, 143, 106, 145, 199, 137, 249, 205, 235 },
-        {   1,  14,  24,  77,  89, 133, 142, 109, 234, 174, 215 }
+        { 142,  61, 171 },
+        { 111,  30, 162 },
+        {  80,   4, 128 },
+        {  53,   1,  87 },
+        {  31,   1,  52 },
+        {  14,   1,  24 }
       }, { /* Coeff Band 4 */
-        {  24, 189, 200, 224, 177, 178, 221, 164, 250, 212, 234 },
-        {  14, 136, 184, 230, 176, 181, 228, 172, 252, 215, 231 },
-        {   2,  87, 140, 222, 159, 176, 230, 172, 252, 218, 238 },
-        {   1,  54,  90, 193, 130, 161, 223, 160, 252, 217, 241 },
-        {   1,  28,  49, 142, 103, 144, 202, 139, 250, 208, 233 },
-        {   1,  12,  21,  73,  87, 132, 141, 106, 234, 176, 209 }
+        { 171,  73, 200 },
+        { 129,  28, 184 },
+        {  86,   3, 140 },
+        {  54,   1,  90 },
+        {  28,   1,  49 },
+        {  12,   1,  21 }
       }, { /* Coeff Band 5 */
-        {  32, 220, 227, 242, 199, 190, 234, 180, 251, 220, 232 },
-        {  12, 155, 200, 242, 190, 191, 240, 187, 252, 225, 230 },
-        {   1,  90, 144, 231, 164, 180, 240, 184, 253, 229, 239 },
-        {   1,  53,  90, 198, 130, 162, 230, 165, 253, 226, 238 },
-        {   1,  28,  50, 145, 103, 144, 207, 140, 251, 213, 236 },
-        {   1,  13,  22,  74,  88, 132, 142, 107, 233, 176, 216 }
+        { 193, 129, 227 },
+        { 148,  28, 200 },
+        {  90,   2, 144 },
+        {  53,   1,  90 },
+        {  28,   1,  50 },
+        {  13,   1,  22 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        {   5,  61, 234, 230, 183, 183, 212, 164, 241, 199, 205 },
-        {   3,  65, 184, 199, 164, 170, 182, 145, 232, 175, 223 },
-        {   1,  56, 104, 154, 137, 158, 156, 131, 221, 165, 210 }
+        {  60,   7, 234 },
+        {  64,   4, 184 },
+        {  56,   1, 104 }
       }, { /* Coeff Band 1 */
-        {  46, 183, 210, 229, 181, 182, 222, 165, 252, 214, 251 },
-        { 122, 166, 202, 228, 179, 181, 223, 164, 252, 217, 250 },
-        {  49, 125, 177, 225, 172, 179, 223, 163, 252, 215, 253 },
-        {  22,  99, 142, 216, 155, 173, 222, 164, 252, 215, 250 },
-        {   8,  69,  95, 180, 127, 156, 220, 153, 252, 214, 250 },
-        {   2,  38,  51, 112, 109, 144, 159, 118, 243, 184, 232 }
+        { 150, 111, 210 },
+        {  87, 185, 202 },
+        { 101,  81, 177 },
+        {  90,  34, 142 },
+        {  67,  11,  95 },
+        {  38,   2,  51 }
       }, { /* Coeff Band 2 */
-        {  56, 196, 218, 236, 187, 185, 231, 172, 254, 223, 239 },
-        {  38, 141, 195, 235, 182, 185, 233, 174, 254, 225, 232 },
-        {   7,  93, 147, 225, 164, 178, 233, 173, 255, 226, 248 },
-        {   2,  63, 101, 201, 137, 165, 227, 162, 254, 225, 248 },
-        {   1,  39,  61, 159, 110, 148, 213, 146, 254, 218, 247 },
-        {   1,  20,  33,  98,  95, 136, 166, 115, 247, 192, 231 }
+        { 153, 139, 218 },
+        { 120,  72, 195 },
+        {  90,  11, 147 },
+        {  63,   3, 101 },
+        {  39,   1,  61 },
+        {  20,   1,  33 }
       }, { /* Coeff Band 3 */
-        {  44, 206, 223, 240, 193, 189, 235, 177, 255, 231, 224 },
-        {  27, 147, 200, 240, 188, 189, 238, 181, 255, 229, 239 },
-        {   4,  93, 147, 230, 165, 180, 238, 180, 255, 231, 237 },
-        {   1,  58,  95, 201, 134, 164, 229, 164, 255, 228, 254 },
-        {   1,  32,  52, 152, 105, 146, 212, 142, 254, 221, 255 },
-        {   1,  14,  23,  81,  87, 133, 156, 109, 248, 191, 236 }
+        { 171, 132, 223 },
+        { 131,  56, 200 },
+        {  92,   6, 147 },
+        {  58,   1,  95 },
+        {  32,   1,  52 },
+        {  14,   1,  23 }
       }, { /* Coeff Band 4 */
-        {  39, 216, 227, 244, 200, 194, 237, 179, 255, 231, 255 },
-        {  22, 152, 204, 243, 192, 193, 240, 186, 255, 231, 240 },
-        {   2,  92, 148, 232, 167, 183, 239, 182, 255, 232, 255 },
-        {   1,  55,  91, 200, 132, 164, 229, 164, 255, 230, 255 },
-        {   1,  28,  47, 144,  99, 142, 211, 141, 255, 222, 251 },
-        {   1,  13,  21,  75,  86, 131, 152, 103, 249, 193, 242 }
+        { 183, 137, 227 },
+        { 139,  48, 204 },
+        {  91,   3, 148 },
+        {  55,   1,  91 },
+        {  28,   1,  47 },
+        {  13,   1,  21 }
       }, { /* Coeff Band 5 */
-        {  34, 228, 234, 249, 213, 201, 246, 194, 255, 239, 255 },
-        {  13, 161, 208, 247, 198, 197, 248, 197, 255, 243, 255 },
-        {   1,  95, 148, 234, 166, 183, 246, 190, 255, 243, 236 },
-        {   1,  55,  90, 199, 128, 161, 237, 168, 255, 239, 255 },
-        {   1,  30,  51, 147, 102, 144, 218, 142, 255, 232, 254 },
-        {   1,  16,  25,  86,  88, 131, 168, 109, 252, 207, 245 }
+        { 198, 149, 234 },
+        { 153,  32, 208 },
+        {  95,   2, 148 },
+        {  55,   1,  90 },
+        {  30,   1,  51 },
+        {  16,   1,  25 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 204,  33, 217, 233, 185, 184, 199, 165, 204, 163, 162 },
-        {  93,  48, 151, 209, 157, 171, 193, 161, 203, 167, 189 },
-        {  18,  43,  86, 173, 126, 156, 203, 149, 231, 193, 200 }
+        {   7, 209, 217 },
+        {  31, 106, 151 },
+        {  40,  21,  86 }
       }, { /* Coeff Band 1 */
-        {  43, 121, 184, 233, 173, 182, 235, 187, 248, 211, 237 },
-        {  93, 117, 177, 232, 170, 180, 235, 182, 246, 204, 224 },
-        {  33, 101, 158, 229, 165, 179, 235, 182, 245, 207, 236 },
-        {  11,  81, 129, 221, 153, 173, 233, 179, 246, 203, 229 },
-        {   2,  51,  82, 188, 124, 158, 224, 162, 248, 206, 228 },
-        {   1,  18,  29,  88,  93, 137, 141, 116, 222, 161, 217 }
+        { 101,  71, 184 },
+        {  74, 131, 177 },
+        {  88,  50, 158 },
+        {  78,  16, 129 },
+        {  51,   2,  82 },
+        {  18,   1,  29 }
       }, { /* Coeff Band 2 */
-        {  63, 154, 199, 239, 184, 187, 236, 187, 248, 209, 221 },
-        {  53, 128, 191, 239, 182, 188, 236, 188, 251, 209, 255 },
-        {  14,  99, 160, 235, 172, 184, 235, 187, 249, 207, 240 },
-        {   4,  75, 122, 219, 150, 173, 226, 177, 250, 204, 240 },
-        {   1,  47,  77, 176, 121, 154, 207, 153, 245, 197, 237 },
-        {   1,  18,  30,  84,  95, 136, 138, 112, 229, 167, 228 }
+        { 116, 115, 199 },
+        { 102,  88, 191 },
+        {  94,  22, 160 },
+        {  74,   6, 122 },
+        {  47,   1,  77 },
+        {  18,   1,  30 }
       }, { /* Coeff Band 3 */
-        {  48, 193, 210, 245, 194, 194, 241, 196, 252, 213, 255 },
-        {  26, 145, 201, 245, 194, 196, 240, 195, 251, 215, 240 },
-        {   6, 104, 165, 241, 179, 190, 239, 191, 253, 222, 255 },
-        {   1,  73, 120, 218, 151, 174, 227, 172, 251, 219, 248 },
-        {   1,  42,  69, 167, 118, 153, 205, 146, 251, 206, 245 },
-        {   1,  16,  27,  84,  89, 133, 148, 112, 240, 179, 238 }
+        { 157, 124, 210 },
+        { 130,  53, 201 },
+        { 102,  10, 165 },
+        {  73,   1, 120 },
+        {  42,   1,  69 },
+        {  16,   1,  27 }
       }, { /* Coeff Band 4 */
-        {  47, 213, 225, 248, 203, 199, 240, 194, 254, 211, 255 },
-        {  32, 153, 212, 248, 201, 199, 241, 196, 251, 226, 255 },
-        {   6, 102, 168, 240, 181, 190, 240, 187, 251, 225, 238 },
-        {   1,  66, 111, 211, 146, 169, 229, 167, 255, 224, 244 },
-        {   1,  36,  60, 157, 110, 148, 209, 143, 252, 215, 255 },
-        {   1,  16,  27,  83,  90, 133, 152, 111, 244, 184, 250 }
+        { 174, 147, 225 },
+        { 134,  67, 212 },
+        { 100,  10, 168 },
+        {  66,   1, 111 },
+        {  36,   1,  60 },
+        {  16,   1,  27 }
       }, { /* Coeff Band 5 */
-        {  46, 225, 232, 252, 219, 208, 247, 204, 254, 233, 255 },
-        {  24, 162, 214, 250, 208, 204, 247, 201, 254, 236, 255 },
-        {   3, 106, 165, 242, 182, 191, 245, 196, 255, 231, 255 },
-        {   1,  66, 108, 213, 142, 169, 235, 175, 255, 226, 247 },
-        {   1,  35,  59, 158, 108, 147, 216, 146, 254, 220, 255 },
-        {   1,  16,  27,  85,  90, 131, 159, 110, 248, 191, 252 }
+        { 185, 165, 232 },
+        { 147,  56, 214 },
+        { 105,   5, 165 },
+        {  66,   1, 108 },
+        {  35,   1,  59 },
+        {  16,   1,  27 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 229,  28, 245, 227, 195, 182, 200, 145, 253, 186, 255 },
-        { 151,  44, 210, 214, 180, 175, 193, 146, 247, 185, 254 },
-        {  55,  48, 131, 183, 148, 163, 194, 138, 249, 201, 246 }
+        {   3, 232, 245 },
+        {  18, 162, 210 },
+        {  38,  64, 131 }
       }, { /* Coeff Band 1 */
-        { 126, 165, 239, 250, 206, 204, 248, 193, 255, 255, 128 },
-        { 199, 158, 231, 248, 206, 198, 247, 200, 243, 255, 255 },
-        { 102, 136, 209, 248, 203, 197, 247, 201, 255, 244, 128 },
-        {  64, 116, 181, 245, 185, 196, 248, 201, 255, 233, 128 },
-        {  44,  98, 151, 233, 162, 179, 248, 195, 255, 242, 128 },
-        {  44,  81, 119, 204, 140, 165, 222, 163, 252, 217, 255 }
+        {  84, 187, 239 },
+        {  35, 231, 231 },
+        {  82, 150, 209 },
+        {  87,  97, 181 },
+        {  81,  64, 151 },
+        {  67,  60, 119 }
       }, { /* Coeff Band 2 */
-        { 108, 185, 239, 252, 216, 209, 248, 205, 255, 230, 128 },
-        {  91, 155, 224, 252, 211, 205, 251, 211, 255, 230, 128 },
-        {  20, 116, 185, 248, 194, 196, 252, 206, 255, 255, 128 },
-        {   8,  86, 141, 239, 168, 185, 248, 196, 255, 247, 128 },
-        {   3,  50,  92, 206, 125, 164, 242, 176, 255, 246, 128 },
-        {   1,  21,  40, 131,  85, 141, 200, 131, 247, 236, 255 }
+        { 107, 185, 239 },
+        { 100, 149, 224 },
+        { 107,  34, 185 },
+        {  83,  12, 141 },
+        {  49,   4,  92 },
+        {  21,   1,  40 }
       }, { /* Coeff Band 3 */
-        {  94, 198, 243, 254, 226, 215, 254, 220, 255, 255, 128 },
-        {  67, 164, 228, 253, 217, 208, 250, 216, 255, 213, 128 },
-        {  14, 120, 185, 250, 196, 205, 248, 205, 255, 255, 128 },
-        {   4,  83, 134, 238, 161, 181, 250, 202, 255, 233, 128 },
-        {   1,  48,  82, 196, 119, 157, 248, 178, 255, 255, 128 },
-        {   1,  26,  38,  96,  84, 132, 221, 110, 255, 209, 128 }
+        { 125, 184, 243 },
+        { 121, 127, 228 },
+        { 113,  25, 185 },
+        {  82,   6, 134 },
+        {  48,   1,  82 },
+        {  26,   1,  38 }
       }, { /* Coeff Band 4 */
-        {  82, 210, 245, 255, 230, 215, 246, 221, 255, 255, 128 },
-        {  55, 170, 231, 254, 222, 213, 255, 220, 255, 255, 128 },
-        {   8, 118, 184, 251, 200, 207, 255, 219, 255, 255, 128 },
-        {   2,  78, 126, 239, 156, 185, 251, 216, 255, 255, 128 },
-        {   1,  43,  68, 189, 108, 151, 247, 187, 255, 228, 128 },
-        {   1,  34,  40, 121, 114, 102, 205,  96, 255, 255, 128 }
+        { 143, 185, 245 },
+        { 133, 115, 231 },
+        { 114,  14, 184 },
+        {  77,   3, 126 },
+        {  43,   1,  68 },
+        {  34,   1,  40 }
       }, { /* Coeff Band 5 */
-        {  65, 228, 241, 255, 231, 214, 253, 222, 255, 255, 128 },
-        {  33, 173, 226, 254, 222, 216, 255, 231, 255, 255, 128 },
-        {   5, 120, 180, 251, 197, 205, 251, 226, 255, 233, 128 },
-        {   1,  81, 130, 240, 159, 187, 251, 206, 255, 205, 128 },
-        {   1,  51,  78, 198, 119, 168, 238, 181, 255, 171, 128 },
-        {   1,  18,  49, 183, 119, 160, 255, 171, 128, 128, 128 }
+        { 170, 194, 241 },
+        { 151,  80, 226 },
+        { 118,   9, 180 },
+        {  81,   1, 130 },
+        {  51,   1,  78 },
+        {  18,   1,  49 }
       }
     }
   }
 };
-static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES] = {
+static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
   { /* block Type 0 */
     { /* Intra */
       { /* Coeff Band 0 */
-        {  37,  34, 137, 205, 154, 170, 151, 159, 109, 172,  44 },
-        {   3,  26,  60, 113, 123, 154, 100, 124, 152, 131, 144 },
-        {   1,  13,  23,  54, 102, 139,  71, 106, 146, 123, 148 }
+        {  29,  42, 137 },
+        {  26,   3,  60 },
+        {  13,   1,  23 }
       }, { /* Coeff Band 1 */
-        {  26,  77, 122, 152, 144, 160, 143, 129, 216, 158, 201 },
-        {  43,  76, 123, 152, 142, 159, 145, 129, 218, 160, 204 },
-        {  25,  67, 112, 150, 141, 159, 144, 128, 218, 159, 204 },
-        {   9,  54,  90, 143, 134, 156, 144, 127, 218, 159, 204 },
-        {   2,  32,  52, 116, 114, 148, 138, 123, 217, 158, 207 },
-        {   1,  10,  15,  44,  91, 133,  75,  99, 172, 128, 169 }
+        {  69,  36, 122 },
+        {  63,  57, 123 },
+        {  60,  33, 112 },
+        {  52,  11,  90 },
+        {  32,   2,  52 },
+        {  10,   1,  15 }
       }, { /* Coeff Band 2 */
-        {  32, 122, 143, 163, 145, 161, 162, 131, 226, 171, 206 },
-        {  46, 105, 143, 168, 148, 161, 165, 133, 228, 174, 204 },
-        {  17,  79, 116, 164, 142, 161, 166, 134, 229, 174, 206 },
-        {   4,  53,  78, 143, 125, 153, 163, 129, 232, 175, 213 },
-        {   1,  29,  44, 105, 105, 142, 147, 120, 228, 168, 211 },
-        {   1,  12,  18,  52,  91, 133,  92, 100, 193, 140, 183 }
+        { 107,  55, 143 },
+        {  86,  69, 143 },
+        {  74,  24, 116 },
+        {  52,   5,  78 },
+        {  29,   1,  44 },
+        {  12,   1,  18 }
       }, { /* Coeff Band 3 */
-        {  33, 157, 160, 182, 149, 163, 185, 141, 236, 185, 218 },
-        {  20, 116, 152, 188, 152, 165, 191, 144, 238, 188, 217 },
-        {   4,  74, 114, 180, 141, 162, 192, 143, 240, 191, 219 },
-        {   1,  44,  69, 148, 119, 151, 183, 134, 243, 192, 227 },
-        {   1,  25,  40, 110, 101, 141, 162, 121, 238, 181, 223 },
-        {   1,  12,  18,  56,  89, 132, 103, 101, 206, 148, 196 }
+        { 137,  71, 160 },
+        { 107,  34, 152 },
+        {  73,   6, 114 },
+        {  44,   1,  69 },
+        {  25,   1,  40 },
+        {  12,   1,  18 }
       }, { /* Coeff Band 4 */
-        {  25, 183, 174, 207, 159, 171, 205, 156, 243, 194, 228 },
-        {  13, 124, 159, 209, 157, 171, 213, 160, 243, 200, 228 },
-        {   2,  75, 117, 199, 143, 166, 215, 158, 246, 205, 230 },
-        {   1,  45,  73, 165, 119, 153, 204, 144, 248, 205, 231 },
-        {   1,  26,  43, 120, 101, 141, 178, 127, 242, 192, 226 },
-        {   1,  12,  19,  59,  89, 132, 112, 102, 215, 154, 201 }
+        { 165,  70, 174 },
+        { 118,  24, 159 },
+        {  74,   3, 117 },
+        {  45,   1,  73 },
+        {  26,   1,  43 },
+        {  12,   1,  19 }
       }, { /* Coeff Band 5 */
-        {  13, 232, 223, 239, 196, 188, 225, 172, 248, 209, 226 },
-        {   4, 155, 187, 237, 184, 187, 233, 180, 250, 216, 232 },
-        {   1,  86, 131, 222, 156, 175, 233, 176, 251, 218, 237 },
-        {   1,  49,  79, 181, 123, 157, 218, 155, 251, 214, 237 },
-        {   1,  26,  43, 125, 100, 141, 188, 130, 246, 199, 231 },
-        {   1,  12,  20,  62,  88, 131, 119, 102, 222, 161, 209 }
+        { 220,  93, 223 },
+        { 153,  10, 187 },
+        {  86,   2, 131 },
+        {  49,   1,  79 },
+        {  26,   1,  43 },
+        {  12,   1,  20 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        {  51,  37, 227, 237, 205, 184, 200, 162, 231, 187, 207 },
-        {   9,  36, 172, 204, 176, 173, 171, 145, 217, 167, 197 },
-        {  21,  26, 112, 162, 145, 162, 155, 133, 215, 165, 191 }
+        {  30,  58, 227 },
+        {  35,  10, 172 },
+        {  24,  23, 112 }
       }, { /* Coeff Band 1 */
-        {  79, 169, 219, 223, 176, 177, 222, 161, 248, 213, 244 },
-        { 177, 166, 216, 222, 175, 178, 222, 161, 246, 212, 226 },
-        { 119, 141, 196, 222, 174, 176, 220, 163, 250, 212, 236 },
-        {  63, 117, 165, 217, 163, 175, 218, 161, 248, 209, 231 },
-        {  30,  87, 117, 192, 138, 162, 216, 157, 247, 211, 224 },
-        {  14,  56,  60, 119, 111, 146, 156, 123, 227, 171, 220 }
+        { 117, 145, 219 },
+        {  51, 221, 216 },
+        {  75, 169, 196 },
+        {  88,  96, 165 },
+        {  77,  43, 117 },
+        {  53,  18,  60 }
       }, { /* Coeff Band 2 */
-        {  88, 195, 225, 229, 181, 181, 229, 171, 252, 212, 221 },
-        {  66, 145, 202, 229, 177, 180, 230, 172, 253, 220, 255 },
-        {  12,  97, 152, 221, 162, 174, 230, 169, 253, 218, 249 },
-        {   3,  66, 103, 198, 138, 165, 223, 159, 253, 219, 251 },
-        {   1,  38,  61, 158, 110, 148, 209, 146, 252, 212, 238 },
-        {   1,  19,  30,  94,  94, 136, 160, 114, 244, 185, 236 }
+        { 128, 176, 225 },
+        { 108, 114, 202 },
+        {  92,  19, 152 },
+        {  65,   4, 103 },
+        {  38,   1,  61 },
+        {  19,   1,  30 }
       }, { /* Coeff Band 3 */
-        {  79, 211, 228, 235, 186, 184, 233, 176, 255, 225, 255 },
-        {  50, 151, 205, 235, 182, 185, 237, 177, 254, 223, 255 },
-        {   7,  95, 149, 225, 162, 176, 236, 177, 254, 229, 219 },
-        {   1,  62,  98, 198, 134, 164, 228, 162, 254, 224, 238 },
-        {   1,  35,  57, 156, 108, 148, 211, 143, 253, 215, 238 },
-        {   1,  17,  26,  87,  89, 135, 161, 113, 246, 189, 237 }
+        { 146, 184, 228 },
+        { 122,  95, 205 },
+        {  92,  11, 149 },
+        {  62,   1,  98 },
+        {  35,   1,  57 },
+        {  17,   1,  26 }
       }, { /* Coeff Band 4 */
-        {  68, 225, 230, 239, 190, 187, 238, 180, 252, 234, 255 },
-        {  39, 156, 206, 239, 185, 187, 241, 187, 254, 231, 255 },
-        {   4,  94, 147, 229, 163, 178, 242, 183, 255, 236, 224 },
-        {   1,  58,  94, 200, 132, 163, 232, 166, 254, 230, 255 },
-        {   1,  32,  52, 153, 104, 146, 214, 144, 253, 222, 236 },
-        {   1,  15,  24,  84,  89, 131, 159, 109, 247, 192, 240 }
+        { 165, 192, 230 },
+        { 132,  81, 206 },
+        {  93,   6, 147 },
+        {  58,   1,  94 },
+        {  32,   1,  52 },
+        {  15,   1,  24 }
       }, { /* Coeff Band 5 */
-        {  45, 248, 234, 248, 208, 198, 244, 193, 255, 233, 255 },
-        {  19, 169, 204, 246, 195, 195, 246, 199, 255, 233, 255 },
-        {   2,  98, 145, 235, 166, 183, 245, 192, 255, 235, 255 },
-        {   1,  59,  92, 205, 131, 164, 236, 172, 254, 231, 250 },
-        {   1,  33,  52, 152, 103, 145, 216, 144, 253, 221, 240 },
-        {   1,  15,  24,  83,  87, 133, 156, 110, 246, 191, 242 }
+        { 204, 223, 234 },
+        { 156,  49, 204 },
+        {  97,   3, 145 },
+        {  59,   1,  92 },
+        {  33,   1,  52 },
+        {  15,   1,  24 }
       }
     }
   }, { /* block Type 1 */
     { /* Intra */
       { /* Coeff Band 0 */
-        { 179,  23, 200, 222, 180, 182, 150, 152, 148, 135, 125 },
-        {  60,  33, 113, 185, 143, 166, 168, 144, 189, 168, 152 },
-        {   8,  31,  59, 137, 114, 150, 163, 132, 206, 171, 169 }
+        {   7, 184, 200 },
+        {  25,  67, 113 },
+        {  30,   9,  59 }
       }, { /* Coeff Band 1 */
-        {  27, 103, 158, 215, 157, 174, 209, 165, 239, 191, 233 },
-        {  90, 101, 159, 213, 156, 173, 212, 164, 230, 185, 237 },
-        {  39,  91, 146, 212, 155, 169, 212, 165, 232, 186, 207 },
-        {  16,  75, 120, 203, 144, 169, 210, 161, 233, 189, 227 },
-        {   3,  48,  76, 167, 120, 154, 199, 146, 236, 190, 218 },
-        {   1,  18,  26,  72,  95, 137, 113, 109, 197, 146, 186 }
+        {  92,  42, 158 },
+        {  65, 121, 159 },
+        {  77,  56, 146 },
+        {  70,  22, 120 },
+        {  47,   4,  76 },
+        {  18,   1,  26 }
       }, { /* Coeff Band 2 */
-        {  45, 137, 177, 218, 166, 174, 206, 163, 234, 184, 214 },
-        {  47, 117, 167, 218, 166, 176, 206, 164, 234, 182, 229 },
-        {  16,  90, 136, 211, 153, 172, 205, 162, 236, 192, 231 },
-        {   6,  65, 100, 188, 136, 162, 193, 155, 237, 177, 228 },
-        {   1,  37,  58, 137, 113, 150, 166, 134, 229, 167, 234 },
-        {   1,  13,  19,  55,  90, 132,  93, 103, 196, 137, 202 }
+        { 113,  81, 177 },
+        {  96,  75, 167 },
+        {  84,  24, 136 },
+        {  63,   8, 100 },
+        {  37,   1,  58 },
+        {  13,   1,  19 }
       }, { /* Coeff Band 3 */
-        {  36, 171, 194, 227, 177, 179, 208, 165, 244, 196, 245 },
-        {  19, 129, 178, 227, 175, 184, 214, 165, 246, 188, 255 },
-        {   5,  90, 139, 217, 158, 174, 213, 166, 246, 198, 255 },
-        {   1,  59,  93, 182, 134, 162, 193, 150, 242, 188, 241 },
-        {   1,  31,  49, 122, 108, 145, 160, 127, 235, 172, 229 },
-        {   1,  10,  18,  54,  89, 132, 101,  99, 213, 144, 217 }
+        { 147,  85, 194 },
+        { 119,  36, 178 },
+        {  88,   8, 139 },
+        {  59,   1,  93 },
+        {  31,   1,  49 },
+        {  10,   1,  18 }
       }, { /* Coeff Band 4 */
-        {  37, 197, 210, 233, 187, 186, 216, 172, 250, 202, 255 },
-        {  20, 142, 191, 234, 183, 186, 219, 170, 249, 207, 246 },
-        {   3,  93, 144, 222, 163, 176, 219, 170, 249, 204, 224 },
-        {   1,  56,  88, 179, 130, 159, 199, 148, 246, 197, 243 },
-        {   1,  29,  47, 123, 104, 144, 172, 127, 244, 185, 234 },
-        {   1,  14,  22,  66,  91, 130, 120, 103, 225, 158, 221 }
+        { 169, 108, 210 },
+        { 131,  41, 191 },
+        {  92,   5, 144 },
+        {  56,   1,  88 },
+        {  29,   1,  47 },
+        {  14,   1,  22 }
       }, { /* Coeff Band 5 */
-        {  19, 227, 223, 245, 203, 194, 238, 187, 251, 225, 217 },
-        {   6, 152, 192, 242, 189, 190, 241, 190, 253, 225, 255 },
-        {   1,  89, 138, 228, 161, 177, 239, 181, 254, 224, 248 },
-        {   1,  52,  84, 188, 127, 157, 224, 159, 253, 222, 247 },
-        {   1,  29,  47, 132, 102, 140, 196, 132, 251, 208, 244 },
-        {   1,  14,  23,  71,  90, 133, 134, 103, 239, 174, 233 }
+        { 210, 106, 223 },
+        { 148,  14, 192 },
+        {  89,   2, 138 },
+        {  52,   1,  84 },
+        {  29,   1,  47 },
+        {  14,   1,  23 }
       }
     }, { /* Inter */
       { /* Coeff Band 0 */
-        { 205,  14, 245, 235, 216, 189, 190, 146, 249, 201, 255 },
-        {  97,  19, 213, 210, 194, 174, 176, 139, 241, 183, 250 },
-        {  31,  20, 144, 183, 160, 167, 171, 132, 240, 184, 253 }
+        {   3, 207, 245 },
+        {  12, 102, 213 },
+        {  18,  33, 144 }
       }, { /* Coeff Band 1 */
-        { 137, 182, 245, 254, 221, 216, 255, 160, 128, 128, 128 },
-        { 231, 185, 242, 251, 218, 205, 255, 233, 128, 128, 128 },
-        { 170, 175, 229, 252, 205, 209, 255, 211, 128, 128, 128 },
-        { 107, 157, 213, 250, 199, 205, 251, 207, 255, 255, 128 },
-        {  77, 126, 183, 243, 182, 183, 252, 206, 255, 255, 128 },
-        {  69,  96, 149, 229, 157, 170, 247, 169, 255, 255, 128 }
+        {  85, 205, 245 },
+        {  18, 249, 242 },
+        {  59, 221, 229 },
+        {  91, 166, 213 },
+        {  88, 117, 183 },
+        {  70,  95, 149 }
       }, { /* Coeff Band 2 */
-        { 107, 196, 241, 252, 211, 208, 255, 210, 128, 128, 128 },
-        {  92, 162, 221, 249, 203, 195, 255, 199, 128, 128, 128 },
-        {  20, 108, 181, 244, 190, 191, 250, 200, 255, 255, 128 },
-        {   7,  80, 132, 241, 172, 197, 253, 191, 255, 255, 128 },
-        {   2,  43,  75, 219, 122, 150, 255, 203, 128, 128, 128 },
-        {   1,  15,  48,  98,  51, 192, 255, 160, 128, 128, 128 }
+        { 114, 193, 241 },
+        { 104, 155, 221 },
+        { 100,  33, 181 },
+        {  78,  10, 132 },
+        {  43,   2,  75 },
+        {  15,   1,  48 }
       }, { /* Coeff Band 3 */
-        { 107, 202, 244, 254, 226, 215, 255, 192, 128, 128, 128 },
-        {  77, 167, 224, 252, 215, 212, 255, 235, 128, 128, 128 },
-        {  14, 117, 179, 249, 191, 196, 255, 212, 128, 128, 128 },
-        {   3,  84, 134, 237, 160, 194, 248, 216, 255, 255, 128 },
-        {   1,  57,  84, 216, 145, 136, 255, 161, 128, 128, 128 },
-        {   1,   1,   1, 255, 128, 255, 128, 128, 128, 128, 128 }
+        { 118, 198, 244 },
+        { 117, 142, 224 },
+        { 111,  25, 179 },
+        {  83,   4, 134 },
+        {  57,   1,  84 },
+        {   1,   1,   1 }
       }, { /* Coeff Band 4 */
-        {  88, 219, 248, 255, 239, 225, 255, 255, 128, 128, 128 },
-        {  61, 178, 234, 255, 227, 227, 255, 217, 128, 128, 128 },
-        {   6, 127, 188, 252, 201, 211, 255, 244, 128, 128, 128 },
-        {   1,  83, 130, 248, 173, 197, 255, 175, 128, 128, 128 },
-        {   1,  61,  66, 211, 121, 188, 255, 213, 128, 128, 128 },
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        { 144, 201, 248 },
+        { 136, 130, 234 },
+        { 124,  12, 188 },
+        {  83,   1, 130 },
+        {  61,   1,  66 },
+        {  64, 171, 128 }
       }, { /* Coeff Band 5 */
-        {  73, 243, 250, 255, 244, 220, 255, 205, 128, 128, 128 },
-        {  42, 197, 242, 255, 237, 227, 242, 166, 255, 255, 128 },
-        {  10, 137, 197, 252, 214, 199, 255, 238, 128, 128, 128 },
-        {   2,  85, 134, 242, 163, 185, 224, 238, 255, 255, 128 },
-        {   1,  70,  69, 199, 110,  64, 255, 213, 128, 128, 128 },
-        {   1,   1,   1,   1, 128, 128, 255,   1, 128, 128, 128 }
+        { 174, 227, 250 },
+        { 165, 118, 242 },
+        { 132,  21, 197 },
+        {  84,   3, 134 },
+        {  70,   1,  69 },
+        {   1,   1,   1 }
       }
     }
   }
 };
-
-#if CONFIG_CODE_NONZEROCOUNT
-
-// TODO(debargha): Remove the macro and count tables after experimentation
-#define NZC_DEFAULT_COUNTS  /* Uncomment to use counts as defaults */
-
-#ifdef NZC_DEFAULT_COUNTS
-static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]
-                                                [REF_TYPES]
-                                                [BLOCK_TYPES]
-                                                [NZC4X4_TOKENS] = {
-  {
-    {
-      { 967652, 29023, 15039, 6952, 1568, 116 },
-      { 289116, 22938, 4522, 1935, 520, 47 }
-    }, {
-      { 967652, 29023, 15039, 6952, 1568, 116 },
-      { 689116, 22938, 4522, 1935, 520, 47 }
-    },
-  }, {
-    {
-      { 124684, 37167, 15270, 8483, 1777, 102 },
-      { 10405, 12395, 3401, 3574, 2461, 771 }
-    }, {
-      { 124684, 37167, 15270, 8483, 1777, 102 },
-      { 20405, 12395, 3401, 3574, 2461, 771 }
+#else
+static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 195,  29, 183 },
+        {  84,  49, 136 },
+        {   8,  42,  71 }
+      }, { /* Coeff Band 1 */
+        {  31, 107, 169 },
+        {  35,  99, 159 },
+        {  17,  82, 140 },
+        {   8,  66, 114 },
+        {   2,  44,  76 },
+        {   1,  19,  32 }
+      }, { /* Coeff Band 2 */
+        {  40, 132, 201 },
+        {  29, 114, 187 },
+        {  13,  91, 157 },
+        {   7,  75, 127 },
+        {   3,  58,  95 },
+        {   1,  28,  47 }
+      }, { /* Coeff Band 3 */
+        {  69, 142, 221 },
+        {  42, 122, 201 },
+        {  15,  91, 159 },
+        {   6,  67, 121 },
+        {   1,  42,  77 },
+        {   1,  17,  31 }
+      }, { /* Coeff Band 4 */
+        { 102, 148, 228 },
+        {  67, 117, 204 },
+        {  17,  82, 154 },
+        {   6,  59, 114 },
+        {   2,  39,  75 },
+        {   1,  15,  29 }
+      }, { /* Coeff Band 5 */
+        { 156,  57, 233 },
+        { 119,  57, 212 },
+        {  58,  48, 163 },
+        {  29,  40, 124 },
+        {  12,  30,  81 },
+        {   3,  12,  31 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 191, 107, 226 },
+        { 124, 117, 204 },
+        {  25,  99, 155 }
+      }, { /* Coeff Band 1 */
+        {  29, 148, 210 },
+        {  37, 126, 194 },
+        {   8,  93, 157 },
+        {   2,  68, 118 },
+        {   1,  39,  69 },
+        {   1,  17,  33 }
+      }, { /* Coeff Band 2 */
+        {  41, 151, 213 },
+        {  27, 123, 193 },
+        {   3,  82, 144 },
+        {   1,  58, 105 },
+        {   1,  32,  60 },
+        {   1,  13,  26 }
+      }, { /* Coeff Band 3 */
+        {  59, 159, 220 },
+        {  23, 126, 198 },
+        {   4,  88, 151 },
+        {   1,  66, 114 },
+        {   1,  38,  71 },
+        {   1,  18,  34 }
+      }, { /* Coeff Band 4 */
+        { 114, 136, 232 },
+        {  51, 114, 207 },
+        {  11,  83, 155 },
+        {   3,  56, 105 },
+        {   1,  33,  65 },
+        {   1,  17,  34 }
+      }, { /* Coeff Band 5 */
+        { 149,  65, 234 },
+        { 121,  57, 215 },
+        {  61,  49, 166 },
+        {  28,  36, 114 },
+        {  12,  25,  76 },
+        {   3,  16,  42 }
+      }
     }
-  }, {
-    {
-      { 4100, 22976, 15627, 16137, 7982, 1793 },
-      { 4249, 3084, 2131, 4081, 6439, 1653 }
-    }, {
-      { 21100, 22976, 15627, 16137, 7982, 1793 },
-      { 4249, 3084, 2131, 4081, 2439, 1653 }
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 214,  49, 220 },
+        { 132,  63, 188 },
+        {  42,  65, 137 }
+      }, { /* Coeff Band 1 */
+        {  85, 137, 221 },
+        { 104, 131, 216 },
+        {  49, 111, 192 },
+        {  21,  87, 155 },
+        {   2,  49,  87 },
+        {   1,  16,  28 }
+      }, { /* Coeff Band 2 */
+        {  89, 163, 230 },
+        {  90, 137, 220 },
+        {  29, 100, 183 },
+        {  10,  70, 135 },
+        {   2,  42,  81 },
+        {   1,  17,  33 }
+      }, { /* Coeff Band 3 */
+        { 108, 167, 237 },
+        {  55, 133, 222 },
+        {  15,  97, 179 },
+        {   4,  72, 135 },
+        {   1,  45,  85 },
+        {   1,  19,  38 }
+      }, { /* Coeff Band 4 */
+        { 124, 146, 240 },
+        {  66, 124, 224 },
+        {  17,  88, 175 },
+        {   4,  58, 122 },
+        {   1,  36,  75 },
+        {   1,  18,  37 }
+      }, { /* Coeff Band 5 */
+        { 141,  79, 241 },
+        { 126,  70, 227 },
+        {  66,  58, 182 },
+        {  30,  44, 136 },
+        {  12,  34,  96 },
+        {   2,  20,  47 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 229,  99, 249 },
+        { 143, 111, 235 },
+        {  46, 109, 192 }
+      }, { /* Coeff Band 1 */
+        {  82, 158, 236 },
+        {  94, 146, 224 },
+        {  25, 117, 191 },
+        {   9,  87, 149 },
+        {   3,  56,  99 },
+        {   1,  33,  57 }
+      }, { /* Coeff Band 2 */
+        {  83, 167, 237 },
+        {  68, 145, 222 },
+        {  10, 103, 177 },
+        {   2,  72, 131 },
+        {   1,  41,  79 },
+        {   1,  20,  39 }
+      }, { /* Coeff Band 3 */
+        {  99, 167, 239 },
+        {  47, 141, 224 },
+        {  10, 104, 178 },
+        {   2,  73, 133 },
+        {   1,  44,  85 },
+        {   1,  22,  47 }
+      }, { /* Coeff Band 4 */
+        { 127, 145, 243 },
+        {  71, 129, 228 },
+        {  17,  93, 177 },
+        {   3,  61, 124 },
+        {   1,  41,  84 },
+        {   1,  21,  52 }
+      }, { /* Coeff Band 5 */
+        { 157,  78, 244 },
+        { 140,  72, 231 },
+        {  69,  58, 184 },
+        {  31,  44, 137 },
+        {  14,  38, 105 },
+        {   8,  23,  61 }
+      }
     }
   }
 };
-
-static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]
-                                                [REF_TYPES]
-                                                [BLOCK_TYPES]
-                                                [NZC8X8_TOKENS] = {
-  {
-    {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },
-    }, {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },
-      { 192052, 30468, 6973, 3250, 1500, 750, 375, 5 },
+static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 125,  34, 187 },
+        {  52,  41, 133 },
+        {   6,  31,  56 }
+      }, { /* Coeff Band 1 */
+        {  37, 109, 153 },
+        {  51, 102, 147 },
+        {  23,  87, 128 },
+        {   8,  67, 101 },
+        {   1,  41,  63 },
+        {   1,  19,  29 }
+      }, { /* Coeff Band 2 */
+        {  31, 154, 185 },
+        {  17, 127, 175 },
+        {   6,  96, 145 },
+        {   2,  73, 114 },
+        {   1,  51,  82 },
+        {   1,  28,  45 }
+      }, { /* Coeff Band 3 */
+        {  23, 163, 200 },
+        {  10, 131, 185 },
+        {   2,  93, 148 },
+        {   1,  67, 111 },
+        {   1,  41,  69 },
+        {   1,  14,  24 }
+      }, { /* Coeff Band 4 */
+        {  29, 176, 217 },
+        {  12, 145, 201 },
+        {   3, 101, 156 },
+        {   1,  69, 111 },
+        {   1,  39,  63 },
+        {   1,  14,  23 }
+      }, { /* Coeff Band 5 */
+        {  57, 192, 233 },
+        {  25, 154, 215 },
+        {   6, 109, 167 },
+        {   3,  78, 118 },
+        {   1,  48,  69 },
+        {   1,  21,  29 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 202, 105, 245 },
+        { 108, 106, 216 },
+        {  18,  90, 144 }
+      }, { /* Coeff Band 1 */
+        {  33, 172, 219 },
+        {  64, 149, 206 },
+        {  14, 117, 177 },
+        {   5,  90, 141 },
+        {   2,  61,  95 },
+        {   1,  37,  57 }
+      }, { /* Coeff Band 2 */
+        {  33, 179, 220 },
+        {  11, 140, 198 },
+        {   1,  89, 148 },
+        {   1,  60, 104 },
+        {   1,  33,  57 },
+        {   1,  12,  21 }
+      }, { /* Coeff Band 3 */
+        {  30, 181, 221 },
+        {   8, 141, 198 },
+        {   1,  87, 145 },
+        {   1,  58, 100 },
+        {   1,  31,  55 },
+        {   1,  12,  20 }
+      }, { /* Coeff Band 4 */
+        {  32, 186, 224 },
+        {   7, 142, 198 },
+        {   1,  86, 143 },
+        {   1,  58, 100 },
+        {   1,  31,  55 },
+        {   1,  12,  22 }
+      }, { /* Coeff Band 5 */
+        {  57, 192, 227 },
+        {  20, 143, 204 },
+        {   3,  96, 154 },
+        {   1,  68, 112 },
+        {   1,  42,  69 },
+        {   1,  19,  32 }
+      }
     }
-  }, {
-    {
-      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
-      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
-    }, {
-      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },
-      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 212,  35, 215 },
+        { 113,  47, 169 },
+        {  29,  48, 105 }
+      }, { /* Coeff Band 1 */
+        {  74, 129, 203 },
+        { 106, 120, 203 },
+        {  49, 107, 178 },
+        {  19,  84, 144 },
+        {   4,  50,  84 },
+        {   1,  15,  25 }
+      }, { /* Coeff Band 2 */
+        {  71, 172, 217 },
+        {  44, 141, 209 },
+        {  15, 102, 173 },
+        {   6,  76, 133 },
+        {   2,  51,  89 },
+        {   1,  24,  42 }
+      }, { /* Coeff Band 3 */
+        {  64, 185, 231 },
+        {  31, 148, 216 },
+        {   8, 103, 175 },
+        {   3,  74, 131 },
+        {   1,  46,  81 },
+        {   1,  18,  30 }
+      }, { /* Coeff Band 4 */
+        {  65, 196, 235 },
+        {  25, 157, 221 },
+        {   5, 105, 174 },
+        {   1,  67, 120 },
+        {   1,  38,  69 },
+        {   1,  15,  30 }
+      }, { /* Coeff Band 5 */
+        {  65, 204, 238 },
+        {  30, 156, 224 },
+        {   7, 107, 177 },
+        {   2,  70, 124 },
+        {   1,  42,  73 },
+        {   1,  18,  34 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 225,  86, 251 },
+        { 144, 104, 235 },
+        {  42,  99, 181 }
+      }, { /* Coeff Band 1 */
+        {  85, 175, 239 },
+        { 112, 165, 229 },
+        {  29, 136, 200 },
+        {  12, 103, 162 },
+        {   6,  77, 123 },
+        {   2,  53,  84 }
+      }, { /* Coeff Band 2 */
+        {  75, 183, 239 },
+        {  30, 155, 221 },
+        {   3, 106, 171 },
+        {   1,  74, 128 },
+        {   1,  44,  76 },
+        {   1,  17,  28 }
+      }, { /* Coeff Band 3 */
+        {  73, 185, 240 },
+        {  27, 159, 222 },
+        {   2, 107, 172 },
+        {   1,  75, 127 },
+        {   1,  42,  73 },
+        {   1,  17,  29 }
+      }, { /* Coeff Band 4 */
+        {  62, 190, 238 },
+        {  21, 159, 222 },
+        {   2, 107, 172 },
+        {   1,  72, 122 },
+        {   1,  40,  71 },
+        {   1,  18,  32 }
+      }, { /* Coeff Band 5 */
+        {  61, 199, 240 },
+        {  27, 161, 226 },
+        {   4, 113, 180 },
+        {   1,  76, 129 },
+        {   1,  46,  80 },
+        {   1,  23,  41 }
+      }
     }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
-      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },
-      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },
-    }
   }
 };
-
-static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]
-                                                  [REF_TYPES]
-                                                  [BLOCK_TYPES]
-                                                  [NZC16X16_TOKENS] = {
-  {
-    {
-      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
-    }, {
-      { 32988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },
-      { 92052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },
+static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        {   7,  27, 153 },
+        {   5,  30,  95 },
+        {   1,  16,  30 }
+      }, { /* Coeff Band 1 */
+        {  50,  75, 127 },
+        {  57,  75, 124 },
+        {  27,  67, 108 },
+        {  10,  54,  86 },
+        {   1,  33,  52 },
+        {   1,  12,  18 }
+      }, { /* Coeff Band 2 */
+        {  43, 125, 151 },
+        {  26, 108, 148 },
+        {   7,  83, 122 },
+        {   2,  59,  89 },
+        {   1,  38,  60 },
+        {   1,  17,  27 }
+      }, { /* Coeff Band 3 */
+        {  23, 144, 163 },
+        {  13, 112, 154 },
+        {   2,  75, 117 },
+        {   1,  50,  81 },
+        {   1,  31,  51 },
+        {   1,  14,  23 }
+      }, { /* Coeff Band 4 */
+        {  18, 162, 185 },
+        {   6, 123, 171 },
+        {   1,  78, 125 },
+        {   1,  51,  86 },
+        {   1,  31,  54 },
+        {   1,  14,  23 }
+      }, { /* Coeff Band 5 */
+        {  15, 199, 227 },
+        {   3, 150, 204 },
+        {   1,  91, 146 },
+        {   1,  55,  95 },
+        {   1,  30,  53 },
+        {   1,  11,  20 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        {  19,  55, 240 },
+        {  19,  59, 196 },
+        {   3,  52, 105 }
+      }, { /* Coeff Band 1 */
+        {  41, 166, 207 },
+        { 104, 153, 199 },
+        {  31, 123, 181 },
+        {  14, 101, 152 },
+        {   5,  72, 106 },
+        {   1,  36,  52 }
+      }, { /* Coeff Band 2 */
+        {  35, 176, 211 },
+        {  12, 131, 190 },
+        {   2,  88, 144 },
+        {   1,  60, 101 },
+        {   1,  36,  60 },
+        {   1,  16,  28 }
+      }, { /* Coeff Band 3 */
+        {  28, 183, 213 },
+        {   8, 134, 191 },
+        {   1,  86, 142 },
+        {   1,  56,  96 },
+        {   1,  30,  53 },
+        {   1,  12,  20 }
+      }, { /* Coeff Band 4 */
+        {  20, 190, 215 },
+        {   4, 135, 192 },
+        {   1,  84, 139 },
+        {   1,  53,  91 },
+        {   1,  28,  49 },
+        {   1,  11,  20 }
+      }, { /* Coeff Band 5 */
+        {  13, 196, 216 },
+        {   2, 137, 192 },
+        {   1,  86, 143 },
+        {   1,  57,  99 },
+        {   1,  32,  56 },
+        {   1,  13,  24 }
+      }
     }
-  }, {
-    {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
-      { 47772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
-    }, {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 211,  29, 217 },
+        {  96,  47, 156 },
+        {  22,  43,  87 }
+      }, { /* Coeff Band 1 */
+        {  78, 120, 193 },
+        { 111, 116, 186 },
+        {  46, 102, 164 },
+        {  15,  80, 128 },
+        {   2,  49,  76 },
+        {   1,  18,  28 }
+      }, { /* Coeff Band 2 */
+        {  71, 161, 203 },
+        {  42, 132, 192 },
+        {  10,  98, 150 },
+        {   3,  69, 109 },
+        {   1,  44,  70 },
+        {   1,  18,  29 }
+      }, { /* Coeff Band 3 */
+        {  57, 186, 211 },
+        {  30, 140, 196 },
+        {   4,  93, 146 },
+        {   1,  62, 102 },
+        {   1,  38,  65 },
+        {   1,  16,  27 }
+      }, { /* Coeff Band 4 */
+        {  47, 199, 217 },
+        {  14, 145, 196 },
+        {   1,  88, 142 },
+        {   1,  57,  98 },
+        {   1,  36,  62 },
+        {   1,  15,  26 }
+      }, { /* Coeff Band 5 */
+        {  26, 219, 229 },
+        {   5, 155, 207 },
+        {   1,  94, 151 },
+        {   1,  60, 104 },
+        {   1,  36,  62 },
+        {   1,  16,  28 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 233,  29, 248 },
+        { 146,  47, 220 },
+        {  43,  52, 140 }
+      }, { /* Coeff Band 1 */
+        { 100, 163, 232 },
+        { 179, 161, 222 },
+        {  63, 142, 204 },
+        {  37, 113, 174 },
+        {  26,  89, 137 },
+        {  18,  68,  97 }
+      }, { /* Coeff Band 2 */
+        {  85, 181, 230 },
+        {  32, 146, 209 },
+        {   7, 100, 164 },
+        {   3,  71, 121 },
+        {   1,  45,  77 },
+        {   1,  18,  30 }
+      }, { /* Coeff Band 3 */
+        {  65, 187, 230 },
+        {  20, 148, 207 },
+        {   2,  97, 159 },
+        {   1,  68, 116 },
+        {   1,  40,  70 },
+        {   1,  14,  29 }
+      }, { /* Coeff Band 4 */
+        {  40, 194, 227 },
+        {   8, 147, 204 },
+        {   1,  94, 155 },
+        {   1,  65, 112 },
+        {   1,  39,  66 },
+        {   1,  14,  26 }
+      }, { /* Coeff Band 5 */
+        {  16, 208, 228 },
+        {   3, 151, 207 },
+        {   1,  98, 160 },
+        {   1,  67, 117 },
+        {   1,  41,  74 },
+        {   1,  17,  31 }
+      }
     }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },
-    }
   }
 };
-
-static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]
-                                                  [REF_TYPES]
-                                                  [BLOCK_TYPES]
-                                                  [NZC32X32_TOKENS] = {
-  {
-    {
-      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
-      { 52052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
-    }, {
-      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },
-      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },
+static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
+  { /* block Type 0 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        {  17,  38, 140 },
+        {   7,  34,  80 },
+        {   1,  17,  29 }
+      }, { /* Coeff Band 1 */
+        {  37,  75, 128 },
+        {  41,  76, 128 },
+        {  26,  66, 116 },
+        {  12,  52,  94 },
+        {   2,  32,  55 },
+        {   1,  10,  16 }
+      }, { /* Coeff Band 2 */
+        {  50, 127, 154 },
+        {  37, 109, 152 },
+        {  16,  82, 121 },
+        {   5,  59,  85 },
+        {   1,  35,  54 },
+        {   1,  13,  20 }
+      }, { /* Coeff Band 3 */
+        {  40, 142, 167 },
+        {  17, 110, 157 },
+        {   2,  71, 112 },
+        {   1,  44,  72 },
+        {   1,  27,  45 },
+        {   1,  11,  17 }
+      }, { /* Coeff Band 4 */
+        {  30, 175, 188 },
+        {   9, 124, 169 },
+        {   1,  74, 116 },
+        {   1,  48,  78 },
+        {   1,  30,  49 },
+        {   1,  11,  18 }
+      }, { /* Coeff Band 5 */
+        {  10, 222, 223 },
+        {   2, 150, 194 },
+        {   1,  83, 128 },
+        {   1,  48,  79 },
+        {   1,  27,  45 },
+        {   1,  11,  17 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        {  36,  41, 235 },
+        {  29,  36, 193 },
+        {  10,  27, 111 }
+      }, { /* Coeff Band 1 */
+        {  85, 165, 222 },
+        { 177, 162, 215 },
+        { 110, 135, 195 },
+        {  57, 113, 168 },
+        {  23,  83, 120 },
+        {  10,  49,  61 }
+      }, { /* Coeff Band 2 */
+        {  85, 190, 223 },
+        {  36, 139, 200 },
+        {   5,  90, 146 },
+        {   1,  60, 103 },
+        {   1,  38,  65 },
+        {   1,  18,  30 }
+      }, { /* Coeff Band 3 */
+        {  72, 202, 223 },
+        {  23, 141, 199 },
+        {   2,  86, 140 },
+        {   1,  56,  97 },
+        {   1,  36,  61 },
+        {   1,  16,  27 }
+      }, { /* Coeff Band 4 */
+        {  55, 218, 225 },
+        {  13, 145, 200 },
+        {   1,  86, 141 },
+        {   1,  57,  99 },
+        {   1,  35,  61 },
+        {   1,  13,  22 }
+      }, { /* Coeff Band 5 */
+        {  15, 235, 212 },
+        {   1, 132, 184 },
+        {   1,  84, 139 },
+        {   1,  57,  97 },
+        {   1,  34,  56 },
+        {   1,  14,  23 }
+      }
     }
-  }, {
-    {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
-    }, {
-      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },
-      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },
+  }, { /* block Type 1 */
+    { /* Intra */
+      { /* Coeff Band 0 */
+        { 181,  21, 201 },
+        {  61,  37, 123 },
+        {  10,  38,  71 }
+      }, { /* Coeff Band 1 */
+        {  47, 106, 172 },
+        {  95, 104, 173 },
+        {  42,  93, 159 },
+        {  18,  77, 131 },
+        {   4,  50,  81 },
+        {   1,  17,  23 }
+      }, { /* Coeff Band 2 */
+        {  62, 147, 199 },
+        {  44, 130, 189 },
+        {  28, 102, 154 },
+        {  18,  75, 115 },
+        {   2,  44,  65 },
+        {   1,  12,  19 }
+      }, { /* Coeff Band 3 */
+        {  55, 153, 210 },
+        {  24, 130, 194 },
+        {   3,  93, 146 },
+        {   1,  61,  97 },
+        {   1,  31,  50 },
+        {   1,  10,  16 }
+      }, { /* Coeff Band 4 */
+        {  49, 186, 223 },
+        {  17, 148, 204 },
+        {   1,  96, 142 },
+        {   1,  53,  83 },
+        {   1,  26,  44 },
+        {   1,  11,  17 }
+      }, { /* Coeff Band 5 */
+        {  13, 217, 212 },
+        {   2, 136, 180 },
+        {   1,  78, 124 },
+        {   1,  50,  83 },
+        {   1,  29,  49 },
+        {   1,  14,  23 }
+      }
+    }, { /* Inter */
+      { /* Coeff Band 0 */
+        { 197,  13, 247 },
+        {  82,  17, 222 },
+        {  25,  17, 162 }
+      }, { /* Coeff Band 1 */
+        { 126, 186, 247 },
+        { 234, 191, 243 },
+        { 176, 177, 234 },
+        { 104, 158, 220 },
+        {  66, 128, 186 },
+        {  55,  90, 137 }
+      }, { /* Coeff Band 2 */
+        { 111, 197, 242 },
+        {  46, 158, 219 },
+        {   9, 104, 171 },
+        {   2,  65, 125 },
+        {   1,  44,  80 },
+        {   1,  17,  91 }
+      }, { /* Coeff Band 3 */
+        { 104, 208, 245 },
+        {  39, 168, 224 },
+        {   3, 109, 162 },
+        {   1,  79, 124 },
+        {   1,  50, 102 },
+        {   1,  43, 102 }
+      }, { /* Coeff Band 4 */
+        {  84, 220, 246 },
+        {  31, 177, 231 },
+        {   2, 115, 180 },
+        {   1,  79, 134 },
+        {   1,  55,  77 },
+        {   1,  60,  79 }
+      }, { /* Coeff Band 5 */
+        {  43, 243, 240 },
+        {   8, 180, 217 },
+        {   1, 115, 166 },
+        {   1,  84, 121 },
+        {   1,  51,  67 },
+        {   1,  16,   6 }
+      }
     }
-  }, {
-    {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
-    }, {
-      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },
-      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },
-    }
   }
 };
-
-#else
-
-static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]
-                                           [REF_TYPES]
-                                           [BLOCK_TYPES]
-                                           [NZC4X4_TOKENS] = {
-  {
-    {
-      { 219, 162, 179, 142, 242, },
-      { 214, 253, 228, 246, 255, },
-    }, {
-      { 225, 236, 190, 229, 253, },
-      { 251, 253, 240, 248, 255, },
-    },
-  }, {
-    {
-      { 106, 126, 158, 126, 244, },
-      { 118, 241, 201, 240, 255, },
-    }, {
-      { 165, 179, 143, 189, 242, },
-      { 173, 239, 192, 255, 128, },
-    },
-  }, {
-    {
-      { 42 , 78 , 153, 92 , 223, },
-      { 128, 128, 128, 128, 128, },
-    }, {
-      { 76 , 68 , 126, 110, 216, },
-      { 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]
-                                           [REF_TYPES]
-                                           [BLOCK_TYPES]
-                                           [NZC8X8_TOKENS] = {
-  {
-    {
-      { 134, 139, 170, 178, 142, 197, 255, },
-      { 167, 224, 199, 252, 205, 255, 128, },
-    }, {
-      { 181, 210, 180, 241, 190, 235, 255, },
-      { 234, 251, 235, 252, 219, 255, 128, },
-    },
-  }, {
-    {
-      { 33 , 64 , 155, 143, 86 , 216, 255, },
-      { 73 , 160, 167, 251, 153, 255, 128, },
-    }, {
-      { 79 , 104, 153, 195, 119, 246, 255, },
-      { 149, 183, 186, 249, 203, 255, 128, },
-    },
-  }, {
-    {
-      { 10 , 25 , 156, 61 , 69 , 156, 254, },
-      { 32 , 1  , 128, 146, 64 , 255, 128, },
-    }, {
-      { 37 , 48 , 143, 113, 81 , 202, 255, },
-      { 1  , 255, 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]
-                                             [REF_TYPES]
-                                             [BLOCK_TYPES]
-                                             [NZC16X16_TOKENS] = {
-  {
-    {
-      { 11 , 188, 210, 167, 141, 143, 152, 255, 128, },
-      { 171, 201, 203, 244, 207, 255, 255, 128, 128, },
-    }, {
-      { 23 , 217, 207, 251, 198, 255, 219, 128, 128, },
-      { 235, 249, 229, 255, 199, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 9  , 45 , 168, 85 , 66 , 221, 139, 246, 255, },
-      { 51 , 110, 163, 238, 94 , 255, 255, 128, 128, },
-    }, {
-      { 4  , 149, 175, 240, 149, 255, 205, 128, 128, },
-      { 141, 217, 186, 255, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 1  , 12 , 173, 6  , 68 , 145, 41 , 204, 255, },
-      { 39 , 47 , 128, 199, 110, 255, 128, 128, 128, },
-    }, {
-      { 1  , 121, 171, 149, 115, 242, 159, 255, 128, },
-      { 1  , 255, 255, 128, 128, 128, 128, 128, 128, },
-    },
-  },
-};
-
-static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]
-                                             [REF_TYPES]
-                                             [BLOCK_TYPES]
-                                             [NZC32X32_TOKENS] = {
-  {
-    {
-      { 11 , 216, 195, 201, 160, 247, 217, 255, 255, 128, 128, },
-      { 177, 240, 239, 255, 192, 128, 128, 128, 128, 128, 128, },
-    }, {
-      { 48 , 235, 213, 235, 199, 255, 255, 128, 128, 128, 128, },
-      { 205, 255, 248, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 6  , 96 , 138, 99 , 125, 248, 188, 255, 128, 128, 128, },
-      { 17 , 53 , 43 , 189, 1  , 255, 171, 128, 128, 128, 128, },
-    }, {
-      { 5  , 187, 235, 232, 117, 255, 219, 128, 128, 128, 128, },
-      { 146, 255, 255, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  }, {
-    {
-      { 1  , 7  , 93 , 14 , 100, 30 , 85 , 65 , 81 , 210, 255, },
-      { 1  , 1  , 128, 26 , 1  , 218, 78 , 255, 255, 128, 128, },
-    }, {
-      { 4  , 148, 206, 137, 160, 255, 255, 128, 128, 128, 128, },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, },
-    },
-  },
-};
 #endif
-
-static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                                            [NZC_TOKENS_EXTRA]
-                                            [NZC_BITS_EXTRA] = {
-  // Bit probabilities are in least to most significance order
-  {
-    {176, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {164, 192, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {154, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  }, {
-    {168, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {152, 184, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {152, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  }, {
-    {160, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4
-    {152, 176, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8
-    {150, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16
-    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32
-    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64
-    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128
-    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256
-    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512
-    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024
-  },
-};
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -8,11 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdio.h>
-
 #include "vp9/common/vp9_entropy.h"
-#include "string.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -20,8 +16,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_coefupdateprobs.h"
 
-const int vp9_i8x8_block[4] = {0, 2, 8, 10};
-
 DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
@@ -41,22 +35,16 @@
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-// Unified coefficient band structure used by all block sizes
-DECLARE_ALIGNED(16, const int, vp9_coef_bands8x8[64]) = {
-  0, 1, 2, 3, 4, 4, 5, 5,
-  1, 2, 3, 4, 4, 5, 5, 5,
-  2, 3, 4, 4, 5, 5, 5, 5,
-  3, 4, 4, 5, 5, 5, 5, 5,
-  4, 4, 5, 5, 5, 5, 5, 5,
-  4, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5,
-  5, 5, 5, 5, 5, 5, 5, 5
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5
 };
-DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {
-  0, 1, 2, 3,
-  1, 2, 3, 4,
-  2, 3, 4, 5,
-  3, 4, 5, 5
+
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
+  5, 5, 5, 5, 5, 5
 };
 
 DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
@@ -63,8 +51,7 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-#if CONFIG_SCATTERSCAN
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = {
   0,  4,  1,  5,
   8,  2, 12,  9,
   3,  6, 13, 10,
@@ -85,7 +72,7 @@
   13, 11, 14, 15,
 };
 
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
+DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = {
   0,  8,  1, 16,  9,  2, 17, 24,
   10,  3, 18, 25, 32, 11,  4, 26,
   33, 19, 40, 12, 34, 27,  5, 41,
@@ -118,7 +105,7 @@
   60, 39, 61, 47, 54, 55, 62, 63,
 };
 
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = {
   0,  16,   1,  32,  17,   2,  48,  33,  18,   3,  64,  34,  49,  19,  65,  80,
   50,   4,  35,  66,  20,  81,  96,  51,   5,  36,  82,  97,  67, 112,  21,  52,
   98,  37,  83, 113,   6,  68, 128,  53,  22,  99, 114,  84,   7, 129,  38,  69,
@@ -175,218 +162,64 @@
   190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,
 };
 
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = {
   0,   32,    1,   64,   33,    2,   96,   65,   34,  128,    3,   97,   66,  160,  129,   35,   98,    4,   67,  130,  161,  192,   36,   99,  224,    5,  162,  193,   68,  131,   37,  100,
   225,  194,  256,  163,   69,  132,    6,  226,  257,  288,  195,  101,  164,   38,  258,    7,  227,  289,  133,  320,   70,  196,  165,  290,  259,  228,   39,  321,  102,  352,    8,  197,
   71,  134,  322,  291,  260,  353,  384,  229,  166,  103,   40,  354,  323,  292,  135,  385,  198,  261,   72,    9,  416,  167,  386,  355,  230,  324,  104,  293,   41,  417,  199,  136,
   262,  387,  448,  325,  356,   10,   73,  418,  231,  168,  449,  294,  388,  105,  419,  263,   42,  200,  357,  450,  137,  480,   74,  326,  232,   11,  389,  169,  295,  420,  106,  451,
   481,  358,  264,  327,  201,   43,  138,  512,  482,  390,  296,  233,  170,  421,   75,  452,  359,   12,  513,  265,  483,  328,  107,  202,  514,  544,  422,  391,  453,  139,   44,  234,
-  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  392,  203,  108,  546,  485,  576,  298,  235,  140,  361,  516,  330,  172,  547,   45,  424,  455,  267,  393,  577,
-  486,   77,  204,  517,  362,  548,  608,   14,  456,  299,  578,  109,  236,  425,  394,  487,  609,  331,  141,  579,  518,   46,  268,   15,  173,  549,  610,  640,  363,   78,  519,  488,
-  300,  205,   16,  457,  580,  426,  550,  395,  110,  237,  611,  641,  332,  672,  142,  642,  269,  458,   47,  581,  427,  489,  174,  364,  520,  612,  551,  673,   79,  206,  301,  643,
-  704,   17,  111,  490,  674,  238,  582,   48,  521,  613,  333,  396,  459,  143,  270,  552,  644,  705,  736,  365,   80,  675,  583,  175,  428,  706,  112,  302,  207,  614,  553,   49,
-  645,  522,  737,  397,  768,  144,  334,   18,  676,  491,  239,  615,  707,  584,   81,  460,  176,  271,  738,  429,  113,  800,  366,  208,  523,  708,  646,  554,  677,  769,   19,  145,
-  585,  739,  240,  303,   50,  461,  616,  398,  647,  335,  492,  177,   82,  770,  832,  555,  272,  430,  678,  209,  709,  114,  740,  801,  617,   51,  304,  679,  524,  367,  586,  241,
-  20,  146,  771,  864,   83,  802,  648,  493,  399,  273,  336,  710,  178,  462,  833,  587,  741,  115,  305,  711,  368,  525,  618,  803,  210,  896,  680,  834,  772,   52,  649,  147,
-  431,  494,  556,  242,  400,  865,  337,   21,  928,  179,  742,   84,  463,  274,  369,  804,  650,  557,  743,  960,  835,  619,  773,  306,  211,  526,  432,  992,  588,  712,  116,  243,
-  866,  495,  681,  558,  805,  589,  401,  897,   53,  338,  148,  682,  867,  464,  275,   22,  370,  433,  307,  620,  527,  836,  774,  651,  713,  744,   85,  180,  621,  465,  929,  775,
-  496,  898,  212,  339,  244,  402,  590,  117,  559,  714,  434,   23,  868,  930,  806,  683,  528,  652,  371,  961,  149,  837,   54,  899,  745,  276,  993,  497,  403,  622,  181,  776,
-  746,  529,  560,  435,   86,  684,  466,  308,  591,  653,  715,  807,  340,  869,  213,  962,  245,  838,  561,  931,  808,  592,  118,  498,  372,  623,  685,  994,  467,  654,  747,  900,
-  716,  277,  150,   55,   24,  404,  530,  839,  777,  655,  182,  963,  840,  686,  778,  309,  870,  341,   87,  499,  809,  624,  593,  436,  717,  932,  214,  246,  995,  718,  625,  373,
-  562,   25,  119,  901,  531,  468,  964,  748,  810,  278,  779,  500,  563,  656,  405,  687,  871,  872,  594,  151,  933,  749,  841,  310,  657,  626,  595,  437,  688,  183,  996,  965,
-  902,  811,  342,  750,  689,  719,  532,   56,  215,  469,  934,  374,  247,  720,  780,  564,  781,  842,  406,   26,  751,  903,  873,   57,  279,  627,  501,  658,  843,  997,  812,  904,
-  88,  813,  438,  752,  935,  936,  311,  596,  533,  690,  343,  966,  874,   89,  120,  470,  721,  875,  659,  782,  565,  998,  375,  844,  845,   27,  628,  967,  121,  905,  968,  152,
-  937,  814,  753,  502,  691,  783,  184,  153,  722,  407,   58,  815,  999,  660,  597,  723,  534,  906,  216,  439,  907,  248,  185,  876,  846,  692,  784,  629,   90,  969,  280,  754,
-  938,  939,  217,  847,  566,  471,  785,  816,  877, 1000,  249,  878,  661,  503,  312,  970,  755,  122,  817,  281,  344,  786,  598,  724,   28,   59,   29,  154,  535,  630,  376, 1001,
-  313,  908,  186,   91,  848,  849,  345,  909,  940,  879,  408,  818,  693, 1002,  971,  941,  567,  377,  218,  756,  910,  787,  440,  123,  880,  725,  662,  250,  819, 1003,  282,  972,
-  850,  599,  472,  409,  155,  441,  942,  757,  788,  694,  911,  881,  314,  631,  973,  504,  187, 1004,  346,  473,  851,  943,  820,  726,   60,  505,  219,  378,  912,  974,   30,   31,
-  536,  882, 1005,   92,  251,  663,  944,  913,  283,  695,  883,  568, 1006,  975,  410,  442,  945,  789,  852,  537, 1007,  124,  315,   61,  758,  821,  600,  914,  976,  569,  474,  347,
-  156, 1008,  915,   93,  977,  506,  946,  727,  379,  884,  188,  632,  601, 1009,  790,  853,  978,  947,  220,  411,  125,  633,  664,  759,  252,  443,  916,  538,  157,  822,   62,  570,
-  979,  284, 1010,  885,  948,  189,  475,   94,  316,  665,  696, 1011,  854,  791,  980,  221,  348,   63,  917,  602,  380,  507,  253,  126,  697,  823,  634,  285,  728,  949,  886,   95,
-  158,  539, 1012,  317,  412,  444,  760,  571,  190,  981,  729,  918,  127,  666,  349,  381,  476,  855,  761, 1013,  603,  222,  159,  698,  950,  508,  254,  792,  286,  635,  887,  793,
-  413,  191,  982,  445,  540,  318,  730,  667,  223,  824,  919, 1014,  350,  477,  572,  255,  825,  951,  762,  509,  604,  856,  382,  699,  287,  319,  636,  983,  794,  414,  541,  731,
-  857,  888,  351,  446,  573, 1015,  668,  889,  478,  826,  383,  763,  605,  920,  510,  637,  415,  700,  921,  858,  447,  952,  542,  795,  479,  953,  732,  890,  669,  574,  511,  984,
-  827,  985,  922, 1016,  764,  606,  543,  701,  859,  638, 1017,  575,  796,  954,  733,  891,  670,  607,  828,  986,  765,  923,  639, 1018,  702,  860,  955,  671,  892,  734,  797,  703,
-  987,  829, 1019,  766,  924,  735,  861,  956,  988,  893,  767,  798,  830, 1020,  925,  957,  799,  862,  831,  989,  894, 1021,  863,  926,  895,  958,  990, 1022,  927,  959,  991, 1023,
+  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  203,  108,  546,  485,  576,  298,  235,  140,  361,  330,  172,  547,   45,  455,  267,  577,  486,   77,  204,  362,
+  608,   14,  299,  578,  109,  236,  487,  609,  331,  141,  579,   46,   15,  173,  610,  363,   78,  205,   16,  110,  237,  611,  142,   47,  174,   79,  206,   17,  111,  238,   48,  143,
+  80,  175,  112,  207,   49,   18,  239,   81,  113,   19,   50,   82,  114,   51,   83,  115,  640,  516,  392,  268,  144,   20,  672,  641,  548,  517,  424,  393,  300,  269,  176,  145,
+  52,   21,  704,  673,  642,  580,  549,  518,  456,  425,  394,  332,  301,  270,  208,  177,  146,   84,   53,   22,  736,  705,  674,  643,  612,  581,  550,  519,  488,  457,  426,  395,
+  364,  333,  302,  271,  240,  209,  178,  147,  116,   85,   54,   23,  737,  706,  675,  613,  582,  551,  489,  458,  427,  365,  334,  303,  241,  210,  179,  117,   86,   55,  738,  707,
+  614,  583,  490,  459,  366,  335,  242,  211,  118,   87,  739,  615,  491,  367,  243,  119,  768,  644,  520,  396,  272,  148,   24,  800,  769,  676,  645,  552,  521,  428,  397,  304,
+  273,  180,  149,   56,   25,  832,  801,  770,  708,  677,  646,  584,  553,  522,  460,  429,  398,  336,  305,  274,  212,  181,  150,   88,   57,   26,  864,  833,  802,  771,  740,  709,
+  678,  647,  616,  585,  554,  523,  492,  461,  430,  399,  368,  337,  306,  275,  244,  213,  182,  151,  120,   89,   58,   27,  865,  834,  803,  741,  710,  679,  617,  586,  555,  493,
+  462,  431,  369,  338,  307,  245,  214,  183,  121,   90,   59,  866,  835,  742,  711,  618,  587,  494,  463,  370,  339,  246,  215,  122,   91,  867,  743,  619,  495,  371,  247,  123,
+  896,  772,  648,  524,  400,  276,  152,   28,  928,  897,  804,  773,  680,  649,  556,  525,  432,  401,  308,  277,  184,  153,   60,   29,  960,  929,  898,  836,  805,  774,  712,  681,
+  650,  588,  557,  526,  464,  433,  402,  340,  309,  278,  216,  185,  154,   92,   61,   30,  992,  961,  930,  899,  868,  837,  806,  775,  744,  713,  682,  651,  620,  589,  558,  527,
+  496,  465,  434,  403,  372,  341,  310,  279,  248,  217,  186,  155,  124,   93,   62,   31,  993,  962,  931,  869,  838,  807,  745,  714,  683,  621,  590,  559,  497,  466,  435,  373,
+  342,  311,  249,  218,  187,  125,   94,   63,  994,  963,  870,  839,  746,  715,  622,  591,  498,  467,  374,  343,  250,  219,  126,   95,  995,  871,  747,  623,  499,  375,  251,  127,
+  900,  776,  652,  528,  404,  280,  156,  932,  901,  808,  777,  684,  653,  560,  529,  436,  405,  312,  281,  188,  157,  964,  933,  902,  840,  809,  778,  716,  685,  654,  592,  561,
+  530,  468,  437,  406,  344,  313,  282,  220,  189,  158,  996,  965,  934,  903,  872,  841,  810,  779,  748,  717,  686,  655,  624,  593,  562,  531,  500,  469,  438,  407,  376,  345,
+  314,  283,  252,  221,  190,  159,  997,  966,  935,  873,  842,  811,  749,  718,  687,  625,  594,  563,  501,  470,  439,  377,  346,  315,  253,  222,  191,  998,  967,  874,  843,  750,
+  719,  626,  595,  502,  471,  378,  347,  254,  223,  999,  875,  751,  627,  503,  379,  255,  904,  780,  656,  532,  408,  284,  936,  905,  812,  781,  688,  657,  564,  533,  440,  409,
+  316,  285,  968,  937,  906,  844,  813,  782,  720,  689,  658,  596,  565,  534,  472,  441,  410,  348,  317,  286, 1000,  969,  938,  907,  876,  845,  814,  783,  752,  721,  690,  659,
+  628,  597,  566,  535,  504,  473,  442,  411,  380,  349,  318,  287, 1001,  970,  939,  877,  846,  815,  753,  722,  691,  629,  598,  567,  505,  474,  443,  381,  350,  319, 1002,  971,
+  878,  847,  754,  723,  630,  599,  506,  475,  382,  351, 1003,  879,  755,  631,  507,  383,  908,  784,  660,  536,  412,  940,  909,  816,  785,  692,  661,  568,  537,  444,  413,  972,
+  941,  910,  848,  817,  786,  724,  693,  662,  600,  569,  538,  476,  445,  414, 1004,  973,  942,  911,  880,  849,  818,  787,  756,  725,  694,  663,  632,  601,  570,  539,  508,  477,
+  446,  415, 1005,  974,  943,  881,  850,  819,  757,  726,  695,  633,  602,  571,  509,  478,  447, 1006,  975,  882,  851,  758,  727,  634,  603,  510,  479, 1007,  883,  759,  635,  511,
+  912,  788,  664,  540,  944,  913,  820,  789,  696,  665,  572,  541,  976,  945,  914,  852,  821,  790,  728,  697,  666,  604,  573,  542, 1008,  977,  946,  915,  884,  853,  822,  791,
+  760,  729,  698,  667,  636,  605,  574,  543, 1009,  978,  947,  885,  854,  823,  761,  730,  699,  637,  606,  575, 1010,  979,  886,  855,  762,  731,  638,  607, 1011,  887,  763,  639,
+  916,  792,  668,  948,  917,  824,  793,  700,  669,  980,  949,  918,  856,  825,  794,  732,  701,  670, 1012,  981,  950,  919,  888,  857,  826,  795,  764,  733,  702,  671, 1013,  982,
+  951,  889,  858,  827,  765,  734,  703, 1014,  983,  890,  859,  766,  735, 1015,  891,  767,  920,  796,  952,  921,  828,  797,  984,  953,  922,  860,  829,  798, 1016,  985,  954,  923,
+  892,  861,  830,  799, 1017,  986,  955,  893,  862,  831, 1018,  987,  894,  863, 1019,  895,  924,  956,  925,  988,  957,  926, 1020,  989,  958,  927, 1021,  990,  959, 1022,  991, 1023,
 };
-#else  // CONFIG_SCATTERSCAN
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {
-  0,  1,  4,  8,
-  5,  2,  3,  6,
-  9, 12, 13, 10,
-  7, 11, 14, 15,
-};
 
-DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {
-  0, 4,  8, 12,
-  1, 5,  9, 13,
-  2, 6, 10, 14,
-  3, 7, 11, 15
-};
-
-DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
-  0,   1,  2,  3,
-  4,   5,  6,  7,
-  8,   9, 10, 11,
-  12, 13, 14, 15
-};
-
-DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
-  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
-  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
-  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
-  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {
-   0,  8, 16, 24, 32, 40, 48, 56,
-   1,  9, 17, 25, 33, 41, 49, 57,
-   2, 10, 18, 26, 34, 42, 50, 58,
-   3, 11, 19, 27, 35, 43, 51, 59,
-   4, 12, 20, 28, 36, 44, 52, 60,
-   5, 13, 21, 29, 37, 45, 53, 61,
-   6, 14, 22, 30, 38, 46, 54, 62,
-   7, 15, 23, 31, 39, 47, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {
-   0,  1,  2,  3,  4,  5,  6,  7,
-   8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23,
-  24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39,
-  40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55,
-  56, 57, 58, 59, 60, 61, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
-  0,   1,  16,  32,  17,   2,   3,  18,
-  33,  48,  64,  49,  34,  19,   4,   5,
-  20,  35,  50,  65,  80,  96,  81,  66,
-  51,  36,  21,   6,   7,  22,  37,  52,
-  67,  82,  97, 112, 128, 113,  98,  83,
-  68,  53,  38,  23,   8,   9,  24,  39,
-  54,  69,  84,  99, 114, 129, 144, 160,
-  145, 130, 115, 100,  85,  70,  55,  40,
-  25,  10,  11,  26,  41,  56,  71,  86,
-  101, 116, 131, 146, 161, 176, 192, 177,
-  162, 147, 132, 117, 102,  87,  72,  57,
-  42,  27,  12,  13,  28,  43,  58, 73,
-  88, 103, 118, 133, 148, 163, 178, 193,
-  208, 224, 209, 194, 179, 164, 149, 134,
-  119, 104,  89,  74,  59,  44,  29,  14,
-  15,  30, 45,  60,  75,  90, 105, 120,
-  135, 150, 165, 180, 195, 210, 225, 240,
-  241, 226, 211, 196, 181, 166, 151, 136,
-  121, 106,  91,  76,  61,  46,  31,  47,
-  62,  77, 92, 107, 122, 137, 152, 167,
-  182, 197, 212, 227, 242, 243, 228, 213,
-  198, 183, 168, 153, 138, 123, 108, 93,
-  78,  63,  79,  94, 109, 124, 139, 154,
-  169, 184, 199, 214, 229, 244, 245, 230,
-  215, 200, 185, 170, 155, 140, 125, 110,
-  95, 111, 126, 141, 156, 171, 186, 201,
-  216, 231, 246, 247, 232, 217, 202, 187,
-  172, 157, 142, 127, 143, 158, 173, 188,
-  203, 218, 233, 248, 249, 234, 219, 204,
-  189, 174, 159, 175, 190, 205, 220, 235,
-  250, 251, 236, 221, 206, 191, 207, 222,
-  237, 252, 253, 238, 223, 239, 254, 255,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {
-    0,  16,  32,  48,  64,  80,  96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
-    1,  17,  33,  49,  65,  81,  97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
-    2,  18,  34,  50,  66,  82,  98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
-    3,  19,  35,  51,  67,  83,  99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
-    4,  20,  36,  52,  68,  84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
-    5,  21,  37,  53,  69,  85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
-    6,  22,  38,  54,  70,  86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
-    7,  23,  39,  55,  71,  87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
-    8,  24,  40,  56,  72,  88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
-    9,  25,  41,  57,  73,  89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
-   10,  26,  42,  58,  74,  90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
-   11,  27,  43,  59,  75,  91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
-   12,  28,  44,  60,  76,  92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
-   13,  29,  45,  61,  77,  93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
-   14,  30,  46,  62,  78,  94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
-   15,  31,  47,  63,  79,  95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {
-    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
-   16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-   32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-   48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
-   64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
-   80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
-   96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
-  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
-  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
-  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
-  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
-  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
-  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
-  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-};
-
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
-    0,    1,   32,   64,   33,    2,    3,   34,   65,   96,  128,   97,   66,   35,    4,    5,   36,   67,   98,  129,  160,  192,  161,  130,   99,   68,   37,    6,    7,   38,   69,  100,
-  131,  162,  193,  224,  256,  225,  194,  163,  132,  101,   70,   39,    8,    9,   40,   71,  102,  133,  164,  195,  226,  257,  288,  320,  289,  258,  227,  196,  165,  134,  103,   72,
-   41,   10,   11,   42,   73,  104,  135,  166,  197,  228,  259,  290,  321,  352,  384,  353,  322,  291,  260,  229,  198,  167,  136,  105,   74,   43,   12,   13,   44,   75,  106,  137,
-  168,  199,  230,  261,  292,  323,  354,  385,  416,  448,  417,  386,  355,  324,  293,  262,  231,  200,  169,  138,  107,   76,   45,   14,   15,   46,   77,  108,  139,  170,  201,  232,
-  263,  294,  325,  356,  387,  418,  449,  480,  512,  481,  450,  419,  388,  357,  326,  295,  264,  233,  202,  171,  140,  109,   78,   47,   16,   17,   48,   79,  110,  141,  172,  203,
-  234,  265,  296,  327,  358,  389,  420,  451,  482,  513,  544,  576,  545,  514,  483,  452,  421,  390,  359,  328,  297,  266,  235,  204,  173,  142,  111,   80,   49,   18,   19,   50,
-   81,  112,  143,  174,  205,  236,  267,  298,  329,  360,  391,  422,  453,  484,  515,  546,  577,  608,  640,  609,  578,  547,  516,  485,  454,  423,  392,  361,  330,  299,  268,  237,
-  206,  175,  144,  113,   82,   51,   20,   21,   52,   83,  114,  145,  176,  207,  238,  269,  300,  331,  362,  393,  424,  455,  486,  517,  548,  579,  610,  641,  672,  704,  673,  642,
-  611,  580,  549,  518,  487,  456,  425,  394,  363,  332,  301,  270,  239,  208,  177,  146,  115,   84,   53,   22,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
-  395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,  768,  737,  706,  675,  644,  613,  582,  551,  520,  489,  458,  427,  396,  365,  334,  303,  272,  241,  210,  179,
-  148,  117,   86,   55,   24,   25,   56,   87,  118,  149,  180,  211,  242,  273,  304,  335,  366,  397,  428,  459,  490,  521,  552,  583,  614,  645,  676,  707,  738,  769,  800,  832,
-  801,  770,  739,  708,  677,  646,  615,  584,  553,  522,  491,  460,  429,  398,  367,  336,  305,  274,  243,  212,  181,  150,  119,   88,   57,   26,   27,   58,   89,  120,  151,  182,
-  213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,  896,  865,  834,  803,  772,  741,  710,  679,  648,  617,
-  586,  555,  524,  493,  462,  431,  400,  369,  338,  307,  276,  245,  214,  183,  152,  121,   90,   59,   28,   29,   60,   91,  122,  153,  184,  215,  246,  277,  308,  339,  370,  401,
-  432,  463,  494,  525,  556,  587,  618,  649,  680,  711,  742,  773,  804,  835,  866,  897,  928,  960,  929,  898,  867,  836,  805,  774,  743,  712,  681,  650,  619,  588,  557,  526,
-  495,  464,  433,  402,  371,  340,  309,  278,  247,  216,  185,  154,  123,   92,   61,   30,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
-  527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,  993,  962,  931,  900,  869,  838,  807,  776,  745,  714,  683,  652,  621,  590,  559,  528,
-  497,  466,  435,  404,  373,  342,  311,  280,  249,  218,  187,  156,  125,   94,   63,   95,  126,  157,  188,  219,  250,  281,  312,  343,  374,  405,  436,  467,  498,  529,  560,  591,
-  622,  653,  684,  715,  746,  777,  808,  839,  870,  901,  932,  963,  994,  995,  964,  933,  902,  871,  840,  809,  778,  747,  716,  685,  654,  623,  592,  561,  530,  499,  468,  437,
-  406,  375,  344,  313,  282,  251,  220,  189,  158,  127,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
-  841,  872,  903,  934,  965,  996,  997,  966,  935,  904,  873,  842,  811,  780,  749,  718,  687,  656,  625,  594,  563,  532,  501,  470,  439,  408,  377,  346,  315,  284,  253,  222,
-  191,  223,  254,  285,  316,  347,  378,  409,  440,  471,  502,  533,  564,  595,  626,  657,  688,  719,  750,  781,  812,  843,  874,  905,  936,  967,  998,  999,  968,  937,  906,  875,
-  844,  813,  782,  751,  720,  689,  658,  627,  596,  565,  534,  503,  472,  441,  410,  379,  348,  317,  286,  255,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
-  659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000, 1001,  970,  939,  908,  877,  846,  815,  784,  753,  722,  691,  660,  629,  598,  567,  536,  505,  474,  443,  412,
-  381,  350,  319,  351,  382,  413,  444,  475,  506,  537,  568,  599,  630,  661,  692,  723,  754,  785,  816,  847,  878,  909,  940,  971, 1002, 1003,  972,  941,  910,  879,  848,  817,
-  786,  755,  724,  693,  662,  631,  600,  569,  538,  507,  476,  445,  414,  383,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
-  973, 1004, 1005,  974,  943,  912,  881,  850,  819,  788,  757,  726,  695,  664,  633,  602,  571,  540,  509,  478,  447,  479,  510,  541,  572,  603,  634,  665,  696,  727,  758,  789,
-  820,  851,  882,  913,  944,  975, 1006, 1007,  976,  945,  914,  883,  852,  821,  790,  759,  728,  697,  666,  635,  604,  573,  542,  511,  543,  574,  605,  636,  667,  698,  729,  760,
-  791,  822,  853,  884,  915,  946,  977, 1008, 1009,  978,  947,  916,  885,  854,  823,  792,  761,  730,  699,  668,  637,  606,  575,  607,  638,  669,  700,  731,  762,  793,  824,  855,
-  886,  917,  948,  979, 1010, 1011,  980,  949,  918,  887,  856,  825,  794,  763,  732,  701,  670,  639,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012, 1013,  982,
-  951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
-  923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
-};
-#endif  // CONFIG_SCATTERSCAN
-
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
 const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
 {
-  -DCT_EOB_TOKEN, 2,                             /* 0 = EOB */
-  -ZERO_TOKEN, 4,                               /* 1 = ZERO */
-  -ONE_TOKEN, 6,                               /* 2 = ONE */
+#if CONFIG_BALANCED_COEFTREE
+  -ZERO_TOKEN, 2,                             /* 0 = ZERO */
+  -DCT_EOB_TOKEN, 4,                          /* 1 = EOB  */
+#else
+  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
+  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
+#endif
+  -ONE_TOKEN, 6,                              /* 2 = ONE */
   8, 12,                                      /* 3 = LOW_VAL */
   -TWO_TOKEN, 10,                            /* 4 = TWO */
   -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                    /* 6 = HIGH_LOW */
+  14, 16,                                   /* 6 = HIGH_LOW */
   -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
   18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
 };
 
-struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
 
 /* Trees for extra bits.  Probabilities are constant and
    do not depend on previously encoded bits */
@@ -400,1660 +233,189 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
 };
 
-#if CONFIG_CODE_NONZEROCOUNT
-const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  -NZC_3TO4, 8,
-  -NZC_5TO8, -NZC_9TO16,
+const vp9_tree_index vp9_coefmodel_tree[6] = {
+#if CONFIG_BALANCED_COEFTREE
+  -ZERO_TOKEN, 2,
+  -DCT_EOB_MODEL_TOKEN, 4,
+#else
+  -DCT_EOB_MODEL_TOKEN, 2,                      /* 0 = EOB */
+  -ZERO_TOKEN, 4,                               /* 1 = ZERO */
+#endif
+  -ONE_TOKEN, -TWO_TOKEN,
 };
-struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS];
 
-const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  -NZC_9TO16, 12,
-  -NZC_17TO32, -NZC_33TO64,
-};
-struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS];
+// Model obtained from a 2-sided zero-centerd distribuition derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probablity of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
 
-const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  12, 14,
-  -NZC_9TO16, -NZC_17TO32,
-  -NZC_33TO64, 16,
-  -NZC_65TO128, -NZC_129TO256,
-};
-struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS];
-
-const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = {
-  -NZC_0, 2,
-  4, 6,
-  -NZC_1, -NZC_2,
-  8, 10,
-  -NZC_3TO4, -NZC_5TO8,
-  12, 14,
-  -NZC_9TO16, -NZC_17TO32,
-  16, 18,
-  -NZC_33TO64, -NZC_65TO128,
-  -NZC_129TO256, 20,
-  -NZC_257TO512, -NZC_513TO1024,
-};
-struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS];
-
-const int vp9_extranzcbits[NZC32X32_TOKENS] = {
-  0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
-};
-
-const int vp9_basenzcvalue[NZC32X32_TOKENS] = {
-  0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513
-};
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-#if CONFIG_MODELCOEFPROB
-
-const vp9_prob vp9_modelcoefprobs_gg875[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.875)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   2,   6,  86, 129,  11,  87,  42,  92,  52,},
-  {2,   4,  12,  87, 129,  22,  89,  75,  97,  91,},
-  {3,   6,  17,  88, 130,  32,  90, 102, 102, 121,},
-  {4,   8,  22,  89, 131,  41,  91, 125, 107, 145,},
-  {5,  10,  28,  90, 131,  50,  93, 144, 112, 164,},
-  {6,  12,  33,  90, 132,  59,  94, 160, 117, 180,},
-  {7,  14,  38,  91, 132,  67,  95, 173, 122, 193,},
-  {8,  15,  42,  92, 133,  75,  97, 185, 126, 204,},
-  {9,  17,  47,  92, 133,  82,  98, 195, 131, 212,},
-  {10,  19,  52,  93, 134,  89,  99, 203, 135, 220,},
-  {11,  21,  56,  94, 134,  96, 101, 211, 140, 226,},
-  {12,  23,  60,  95, 135, 102, 102, 217, 144, 231,},
-  {13,  25,  65,  95, 135, 109, 103, 222, 148, 235,},
-  {14,  26,  69,  96, 136, 115, 105, 227, 153, 238,},
-  {15,  28,  73,  97, 136, 120, 106, 231, 157, 241,},
-  {16,  30,  77,  97, 137, 126, 107, 234, 161, 244,},
-  {17,  32,  81,  98, 138, 131, 108, 237, 164, 246,},
-  {18,  34,  85,  99, 138, 136, 110, 240, 168, 247,},
-  {19,  35,  89, 100, 139, 141, 111, 242, 172, 249,},
-  {20,  37,  92, 100, 139, 145, 112, 244, 175, 250,},
-  {21,  39,  96, 101, 140, 150, 113, 246, 179, 251,},
-  {22,  41,  99, 102, 140, 154, 115, 247, 182, 252,},
-  {23,  42, 103, 102, 141, 158, 116, 248, 185, 252,},
-  {24,  44, 106, 103, 141, 162, 117, 249, 188, 253,},
-  {25,  46, 110, 104, 142, 166, 118, 250, 191, 253,},
-  {26,  48, 113, 104, 142, 170, 120, 251, 194, 254,},
-  {27,  49, 116, 105, 143, 173, 121, 252, 197, 254,},
-  {28,  51, 119, 106, 143, 176, 122, 252, 200, 254,},
-  {29,  53, 122, 107, 144, 180, 123, 253, 202, 255,},
-  {30,  54, 125, 107, 144, 183, 125, 253, 205, 255,},
-  {31,  56, 128, 108, 145, 186, 126, 254, 207, 255,},
-  {32,  58, 131, 109, 145, 189, 127, 254, 209, 255,},
-  {33,  59, 134, 109, 146, 191, 128, 254, 212, 255,},
-  {34,  61, 137, 110, 146, 194, 130, 254, 214, 255,},
-  {35,  62, 139, 111, 147, 196, 131, 255, 216, 255,},
-  {36,  64, 142, 112, 147, 199, 132, 255, 218, 255,},
-  {37,  66, 145, 112, 148, 201, 134, 255, 220, 255,},
-  {38,  67, 147, 113, 148, 203, 135, 255, 221, 255,},
-  {39,  69, 150, 114, 149, 206, 136, 255, 223, 255,},
-  {40,  70, 152, 114, 149, 208, 137, 255, 225, 255,},
-  {41,  72, 155, 115, 150, 210, 138, 255, 226, 255,},
-  {42,  74, 157, 116, 150, 212, 140, 255, 228, 255,},
-  {43,  75, 159, 117, 151, 213, 141, 255, 229, 255,},
-  {44,  77, 161, 117, 151, 215, 142, 255, 230, 255,},
-  {45,  78, 164, 118, 152, 217, 143, 255, 232, 255,},
-  {46,  80, 166, 119, 152, 219, 145, 255, 233, 255,},
-  {47,  81, 168, 120, 153, 220, 146, 255, 234, 255,},
-  {48,  83, 170, 120, 153, 222, 147, 255, 235, 255,},
-  {49,  84, 172, 121, 154, 223, 148, 255, 236, 255,},
-  {50,  86, 174, 122, 154, 225, 150, 255, 237, 255,},
-  {51,  87, 176, 123, 155, 226, 151, 255, 238, 255,},
-  {52,  89, 178, 123, 155, 227, 152, 255, 239, 255,},
-  {53,  90, 180, 124, 156, 228, 153, 255, 240, 255,},
-  {54,  92, 182, 125, 156, 230, 154, 255, 241, 255,},
-  {55,  93, 183, 126, 157, 231, 156, 255, 242, 255,},
-  {56,  95, 185, 126, 157, 232, 157, 255, 242, 255,},
-  {57,  96, 187, 127, 158, 233, 158, 255, 243, 255,},
-  {58,  98, 189, 128, 158, 234, 159, 255, 244, 255,},
-  {59,  99, 190, 129, 159, 235, 160, 255, 244, 255,},
-  {60, 101, 192, 129, 159, 236, 162, 255, 245, 255,},
-  {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,},
-  {62, 104, 195, 131, 160, 238, 164, 255, 246, 255,},
-  {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,},
-  {64, 106, 198, 132, 162, 239, 166, 255, 247, 255,},
-  {65, 108, 199, 133, 162, 240, 167, 255, 248, 255,},
-  {66, 109, 201, 134, 163, 241, 169, 255, 248, 255,},
-  {67, 111, 202, 135, 163, 241, 170, 255, 249, 255,},
-  {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,},
-  {69, 113, 205, 136, 164, 243, 172, 255, 249, 255,},
-  {70, 115, 206, 137, 165, 243, 173, 255, 250, 255,},
-  {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,},
-  {72, 117, 209, 138, 166, 244, 175, 255, 250, 255,},
-  {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,},
-  {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,},
-  {75, 121, 212, 141, 167, 246, 179, 255, 251, 255,},
-  {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,},
-  {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,},
-  {78, 125, 216, 143, 169, 247, 182, 255, 252, 255,},
-  {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,},
-  {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,},
-  {81, 129, 219, 146, 171, 248, 185, 255, 253, 255,},
-  {82, 131, 220, 146, 171, 249, 186, 255, 253, 255,},
-  {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,},
-  {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,},
-  {85, 134, 223, 149, 173, 250, 189, 255, 253, 255,},
-  {86, 136, 224, 149, 173, 250, 190, 255, 254, 255,},
-  {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,},
-  {88, 138, 226, 151, 174, 251, 192, 255, 254, 255,},
-  {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,},
-  {90, 141, 227, 153, 175, 251, 194, 255, 254, 255,},
-  {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,},
-  {92, 143, 229, 154, 177, 252, 196, 255, 254, 255,},
-  {93, 144, 230, 155, 177, 252, 197, 255, 254, 255,},
-  {94, 146, 230, 156, 178, 252, 198, 255, 255, 255,},
-  {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,},
-  {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,},
-  {97, 149, 233, 158, 179, 253, 201, 255, 255, 255,},
-  {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,},
-  {99, 152, 234, 160, 180, 253, 203, 255, 255, 255,},
-  {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,},
-  {101, 154, 235, 161, 182, 253, 205, 255, 255, 255,},
-  {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,},
-  {103, 156, 236, 163, 183, 254, 207, 255, 255, 255,},
-  {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,},
-  {105, 159, 238, 165, 184, 254, 208, 255, 255, 255,},
-  {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,},
-  {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,},
-  {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,},
-  {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,},
-  {110, 164, 240, 169, 187, 254, 212, 255, 255, 255,},
-  {111, 165, 241, 170, 187, 254, 213, 255, 255, 255,},
-  {112, 166, 241, 170, 188, 255, 214, 255, 255, 255,},
-  {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,},
-  {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,},
-  {115, 170, 243, 173, 189, 255, 216, 255, 255, 255,},
-  {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,},
-  {117, 172, 244, 174, 190, 255, 218, 255, 255, 255,},
-  {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,},
-  {119, 174, 244, 176, 192, 255, 219, 255, 255, 255,},
-  {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,},
-  {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,},
-  {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,},
-  {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,},
-  {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,},
-  {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,},
-  {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,},
-  {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,},
-  {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,},
-  {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,},
-  {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,},
-  {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,},
-  {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,},
-  {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,},
-  {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,},
-  {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,},
-  {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,},
-  {137, 192, 250, 190, 202, 255, 231, 255, 255, 255,},
-  {138, 193, 250, 191, 202, 255, 232, 255, 255, 255,},
-  {139, 194, 250, 192, 203, 255, 232, 255, 255, 255,},
-  {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,},
-  {141, 195, 251, 194, 204, 255, 234, 255, 255, 255,},
-  {142, 196, 251, 194, 205, 255, 234, 255, 255, 255,},
-  {143, 197, 251, 195, 205, 255, 235, 255, 255, 255,},
-  {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,},
-  {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,},
-  {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,},
-  {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,},
-  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},
-  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},
-  {150, 203, 252, 201, 209, 255, 238, 255, 255, 255,},
-  {151, 204, 253, 201, 210, 255, 239, 255, 255, 255,},
-  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},
-  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},
-  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},
-  {155, 208, 253, 204, 212, 255, 240, 255, 255, 255,},
-  {156, 209, 253, 205, 213, 255, 241, 255, 255, 255,},
-  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},
-  {158, 210, 254, 207, 214, 255, 242, 255, 255, 255,},
-  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},
-  {160, 212, 254, 208, 215, 255, 242, 255, 255, 255,},
-  {161, 213, 254, 209, 215, 255, 243, 255, 255, 255,},
-  {162, 213, 254, 210, 216, 255, 243, 255, 255, 255,},
-  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},
-  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},
-  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},
-  {166, 216, 254, 212, 218, 255, 245, 255, 255, 255,},
-  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},
-  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},
-  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},
-  {170, 219, 255, 215, 221, 255, 246, 255, 255, 255,},
-  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},
-  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},
-  {173, 222, 255, 217, 222, 255, 247, 255, 255, 255,},
-  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},
-  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},
-  {176, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
-  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
-  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},
-  {179, 226, 255, 222, 225, 255, 249, 255, 255, 255,},
-  {180, 226, 255, 222, 226, 255, 249, 255, 255, 255,},
-  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},
-  {182, 228, 255, 224, 227, 255, 249, 255, 255, 255,},
-  {183, 228, 255, 224, 228, 255, 250, 255, 255, 255,},
-  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},
-  {185, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
-  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
-  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},
-  {188, 232, 255, 228, 230, 255, 251, 255, 255, 255,},
-  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},
-  {190, 233, 255, 229, 231, 255, 251, 255, 255, 255,},
-  {191, 233, 255, 229, 232, 255, 251, 255, 255, 255,},
-  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},
-  {193, 234, 255, 231, 233, 255, 252, 255, 255, 255,},
-  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},
-  {195, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
-  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
-  {197, 237, 255, 233, 235, 255, 252, 255, 255, 255,},
-  {198, 237, 255, 234, 235, 255, 253, 255, 255, 255,},
-  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},
-  {200, 238, 255, 235, 236, 255, 253, 255, 255, 255,},
-  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},
-  {202, 239, 255, 236, 237, 255, 253, 255, 255, 255,},
-  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},
-  {204, 240, 255, 237, 238, 255, 254, 255, 255, 255,},
-  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
-  {206, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
-  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
-  {208, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
-  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
-  {210, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
-  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
-  {212, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
-  {213, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
-  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
-  {215, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
-  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
-  {217, 246, 255, 244, 244, 255, 255, 255, 255, 255,},
-  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},
-  {219, 247, 255, 245, 245, 255, 255, 255, 255, 255,},
-  {220, 248, 255, 245, 246, 255, 255, 255, 255, 255,},
-  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},
-  {222, 248, 255, 246, 247, 255, 255, 255, 255, 255,},
-  {223, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {225, 250, 255, 247, 248, 255, 255, 255, 255, 255,},
-  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {227, 250, 255, 248, 249, 255, 255, 255, 255, 255,},
-  {228, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {230, 251, 255, 249, 250, 255, 255, 255, 255, 255,},
-  {231, 251, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {232, 252, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {234, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {235, 253, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {237, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {238, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {239, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {240, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {242, 254, 255, 253, 254, 255, 255, 255, 255, 255,},
-  {243, 254, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {244, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {245, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {247, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {248, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {249, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {250, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {251, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {252, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-};
-
-const vp9_prob vp9_modelcoefprobs_gg75[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.75)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   2,   6,  87, 129,  11,  88,  39,  93,  47,},
-  {2,   4,  11,  88, 130,  21,  89,  68,  98,  79,},
-  {3,   6,  16,  89, 131,  30,  91,  92, 103, 105,},
-  {4,   8,  21,  90, 131,  38,  92, 112, 107, 126,},
-  {5,  10,  26,  90, 132,  46,  94, 129, 111, 143,},
-  {6,  11,  31,  91, 133,  54,  95, 143, 115, 157,},
-  {7,  13,  35,  92, 133,  61,  96, 156, 119, 170,},
-  {8,  15,  40,  93, 134,  68,  97, 167, 123, 180,},
-  {9,  17,  44,  94, 134,  74,  98, 177, 126, 189,},
-  {10,  19,  48,  94, 135,  80, 100, 185, 130, 197,},
-  {11,  20,  52,  95, 135,  86, 101, 192, 133, 204,},
-  {12,  22,  56,  96, 136,  92, 102, 199, 137, 210,},
-  {13,  24,  60,  96, 136,  97, 103, 205, 140, 215,},
-  {14,  26,  64,  97, 137, 103, 104, 210, 143, 219,},
-  {15,  27,  68,  98, 137, 108, 105, 215, 146, 223,},
-  {16,  29,  71,  98, 138, 112, 106, 219, 149, 227,},
-  {17,  31,  75,  99, 138, 117, 107, 223, 152, 230,},
-  {18,  32,  78, 100, 139, 121, 108, 226, 155, 233,},
-  {19,  34,  82, 100, 139, 126, 109, 229, 158, 235,},
-  {20,  36,  85, 101, 140, 130, 110, 231, 161, 238,},
-  {21,  37,  88, 102, 140, 134, 111, 234, 164, 239,},
-  {22,  39,  91, 102, 141, 138, 112, 236, 167, 241,},
-  {23,  40,  94, 103, 141, 141, 113, 238, 169, 243,},
-  {24,  42,  97, 104, 142, 145, 114, 240, 172, 244,},
-  {25,  44, 100, 104, 142, 149, 115, 241, 174, 245,},
-  {26,  45, 103, 105, 143, 152, 116, 243, 177, 246,},
-  {27,  47, 106, 105, 143, 155, 117, 244, 179, 247,},
-  {28,  48, 109, 106, 143, 158, 118, 245, 182, 248,},
-  {29,  50, 112, 107, 144, 161, 119, 246, 184, 249,},
-  {30,  52, 115, 107, 144, 164, 120, 247, 186, 250,},
-  {31,  53, 117, 108, 145, 167, 121, 248, 188, 250,},
-  {32,  55, 120, 109, 145, 170, 122, 249, 190, 251,},
-  {33,  56, 122, 109, 146, 173, 123, 249, 192, 252,},
-  {34,  58, 125, 110, 146, 175, 124, 250, 194, 252,},
-  {35,  59, 127, 110, 147, 178, 125, 251, 196, 252,},
-  {36,  61, 130, 111, 147, 180, 126, 251, 198, 253,},
-  {37,  62, 132, 112, 147, 183, 127, 251, 200, 253,},
-  {38,  64, 135, 112, 148, 185, 128, 252, 202, 253,},
-  {39,  65, 137, 113, 148, 187, 129, 252, 204, 254,},
-  {40,  67, 139, 114, 149, 189, 130, 253, 205, 254,},
-  {41,  68, 141, 114, 149, 191, 131, 253, 207, 254,},
-  {42,  70, 144, 115, 150, 193, 132, 253, 209, 254,},
-  {43,  71, 146, 115, 150, 195, 133, 254, 210, 254,},
-  {44,  72, 148, 116, 151, 197, 134, 254, 212, 255,},
-  {45,  74, 150, 117, 151, 199, 135, 254, 213, 255,},
-  {46,  75, 152, 117, 151, 201, 136, 254, 215, 255,},
-  {47,  77, 154, 118, 152, 202, 137, 254, 216, 255,},
-  {48,  78, 156, 119, 152, 204, 138, 254, 217, 255,},
-  {49,  80, 158, 119, 153, 206, 139, 255, 219, 255,},
-  {50,  81, 160, 120, 153, 207, 140, 255, 220, 255,},
-  {51,  82, 162, 120, 154, 209, 141, 255, 221, 255,},
-  {52,  84, 164, 121, 154, 210, 142, 255, 222, 255,},
-  {53,  85, 165, 122, 155, 212, 143, 255, 224, 255,},
-  {54,  87, 167, 122, 155, 213, 144, 255, 225, 255,},
-  {55,  88, 169, 123, 155, 215, 145, 255, 226, 255,},
-  {56,  89, 171, 124, 156, 216, 146, 255, 227, 255,},
-  {57,  91, 172, 124, 156, 217, 146, 255, 228, 255,},
-  {58,  92, 174, 125, 157, 218, 147, 255, 229, 255,},
-  {59,  93, 176, 126, 157, 220, 148, 255, 230, 255,},
-  {60,  95, 177, 126, 158, 221, 149, 255, 231, 255,},
-  {61,  96, 179, 127, 158, 222, 150, 255, 232, 255,},
-  {62,  97, 180, 127, 159, 223, 151, 255, 232, 255,},
-  {63,  99, 182, 128, 159, 224, 152, 255, 233, 255,},
-  {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,},
-  {65, 101, 185, 129, 160, 226, 154, 255, 235, 255,},
-  {66, 103, 186, 130, 160, 227, 155, 255, 236, 255,},
-  {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,},
-  {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,},
-  {69, 106, 190, 132, 162, 230, 158, 255, 238, 255,},
-  {70, 108, 192, 133, 162, 231, 159, 255, 238, 255,},
-  {71, 109, 193, 133, 162, 231, 159, 255, 239, 255,},
-  {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,},
-  {73, 111, 196, 134, 163, 233, 161, 255, 240, 255,},
-  {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,},
-  {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,},
-  {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,},
-  {77, 116, 200, 137, 165, 236, 165, 255, 243, 255,},
-  {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,},
-  {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,},
-  {80, 120, 204, 139, 167, 238, 168, 255, 244, 255,},
-  {81, 121, 205, 140, 167, 239, 168, 255, 244, 255,},
-  {82, 123, 206, 140, 167, 239, 169, 255, 245, 255,},
-  {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,},
-  {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,},
-  {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,},
-  {86, 127, 210, 143, 169, 241, 173, 255, 247, 255,},
-  {87, 129, 211, 144, 170, 242, 174, 255, 247, 255,},
-  {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,},
-  {89, 131, 213, 145, 171, 243, 175, 255, 248, 255,},
-  {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,},
-  {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,},
-  {92, 134, 216, 147, 172, 244, 178, 255, 249, 255,},
-  {93, 136, 217, 148, 172, 245, 179, 255, 249, 255,},
-  {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,},
-  {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,},
-  {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,},
-  {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,},
-  {98, 141, 221, 151, 175, 247, 183, 255, 250, 255,},
-  {99, 142, 222, 152, 175, 247, 184, 255, 250, 255,},
-  {100, 144, 223, 152, 176, 247, 185, 255, 251, 255,},
-  {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,},
-  {102, 146, 224, 154, 177, 248, 186, 255, 251, 255,},
-  {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,},
-  {104, 148, 226, 155, 177, 248, 188, 255, 252, 255,},
-  {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,},
-  {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,},
-  {107, 151, 228, 157, 179, 249, 191, 255, 252, 255,},
-  {108, 152, 229, 158, 179, 250, 191, 255, 252, 255,},
-  {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,},
-  {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,},
-  {111, 155, 231, 160, 181, 250, 194, 255, 253, 255,},
-  {112, 157, 231, 160, 181, 251, 195, 255, 253, 255,},
-  {113, 158, 232, 161, 182, 251, 195, 255, 253, 255,},
-  {114, 159, 232, 162, 182, 251, 196, 255, 253, 255,},
-  {115, 160, 233, 162, 183, 251, 197, 255, 253, 255,},
-  {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,},
-  {117, 162, 234, 164, 184, 252, 198, 255, 254, 255,},
-  {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,},
-  {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,},
-  {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,},
-  {121, 166, 236, 167, 186, 252, 201, 255, 254, 255,},
-  {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,},
-  {123, 168, 237, 168, 186, 253, 203, 255, 254, 255,},
-  {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,},
-  {125, 170, 238, 169, 187, 253, 204, 255, 254, 255,},
-  {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,},
-  {127, 172, 239, 171, 188, 253, 206, 255, 254, 255,},
-  {128, 173, 240, 171, 189, 253, 207, 255, 255, 255,},
-  {129, 174, 240, 172, 189, 253, 207, 255, 255, 255,},
-  {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,},
-  {131, 176, 241, 174, 190, 254, 209, 255, 255, 255,},
-  {132, 177, 241, 174, 191, 254, 209, 255, 255, 255,},
-  {133, 178, 242, 175, 191, 254, 210, 255, 255, 255,},
-  {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,},
-  {135, 180, 243, 176, 192, 254, 212, 255, 255, 255,},
-  {136, 180, 243, 177, 193, 254, 212, 255, 255, 255,},
-  {137, 181, 243, 178, 193, 254, 213, 255, 255, 255,},
-  {138, 182, 244, 179, 194, 254, 214, 255, 255, 255,},
-  {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,},
-  {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,},
-  {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,},
-  {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,},
-  {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,},
-  {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,},
-  {145, 189, 246, 183, 197, 255, 218, 255, 255, 255,},
-  {146, 190, 246, 184, 198, 255, 219, 255, 255, 255,},
-  {147, 191, 247, 185, 198, 255, 220, 255, 255, 255,},
-  {148, 191, 247, 186, 199, 255, 220, 255, 255, 255,},
-  {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,},
-  {150, 193, 248, 187, 200, 255, 221, 255, 255, 255,},
-  {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,},
-  {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,},
-  {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,},
-  {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,},
-  {155, 198, 249, 191, 202, 255, 224, 255, 255, 255,},
-  {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,},
-  {157, 199, 249, 192, 203, 255, 226, 255, 255, 255,},
-  {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,},
-  {159, 201, 250, 193, 204, 255, 227, 255, 255, 255,},
-  {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,},
-  {161, 203, 250, 195, 206, 255, 228, 255, 255, 255,},
-  {162, 203, 250, 196, 206, 255, 228, 255, 255, 255,},
-  {163, 204, 251, 196, 207, 255, 229, 255, 255, 255,},
-  {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,},
-  {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,},
-  {166, 207, 251, 198, 208, 255, 231, 255, 255, 255,},
-  {167, 207, 251, 199, 209, 255, 231, 255, 255, 255,},
-  {168, 208, 252, 200, 209, 255, 232, 255, 255, 255,},
-  {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,},
-  {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,},
-  {171, 211, 252, 202, 211, 255, 233, 255, 255, 255,},
-  {172, 211, 252, 203, 211, 255, 234, 255, 255, 255,},
-  {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,},
-  {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,},
-  {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,},
-  {176, 214, 253, 206, 213, 255, 236, 255, 255, 255,},
-  {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,},
-  {178, 216, 253, 207, 214, 255, 237, 255, 255, 255,},
-  {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,},
-  {180, 217, 253, 208, 216, 255, 237, 255, 255, 255,},
-  {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,},
-  {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,},
-  {183, 220, 254, 211, 217, 255, 239, 255, 255, 255,},
-  {184, 220, 254, 211, 218, 255, 239, 255, 255, 255,},
-  {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,},
-  {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,},
-  {187, 222, 254, 213, 219, 255, 241, 255, 255, 255,},
-  {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,},
-  {189, 224, 254, 215, 220, 255, 241, 255, 255, 255,},
-  {190, 225, 254, 215, 221, 255, 242, 255, 255, 255,},
-  {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,},
-  {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,},
-  {193, 227, 255, 218, 223, 255, 243, 255, 255, 255,},
-  {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,},
-  {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,},
-  {196, 229, 255, 220, 224, 255, 244, 255, 255, 255,},
-  {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,},
-  {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,},
-  {199, 230, 255, 222, 226, 255, 245, 255, 255, 255,},
-  {200, 231, 255, 222, 226, 255, 246, 255, 255, 255,},
-  {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,},
-  {202, 232, 255, 224, 228, 255, 246, 255, 255, 255,},
-  {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,},
-  {204, 234, 255, 225, 229, 255, 247, 255, 255, 255,},
-  {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,},
-  {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,},
-  {207, 235, 255, 227, 230, 255, 248, 255, 255, 255,},
-  {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,},
-  {209, 237, 255, 229, 231, 255, 248, 255, 255, 255,},
-  {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,},
-  {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,},
-  {212, 238, 255, 231, 233, 255, 249, 255, 255, 255,},
-  {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,},
-  {214, 239, 255, 232, 234, 255, 250, 255, 255, 255,},
-  {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,},
-  {216, 241, 255, 233, 235, 255, 250, 255, 255, 255,},
-  {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,},
-  {218, 242, 255, 235, 236, 255, 251, 255, 255, 255,},
-  {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,},
-  {220, 243, 255, 236, 237, 255, 251, 255, 255, 255,},
-  {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,},
-  {222, 244, 255, 237, 239, 255, 252, 255, 255, 255,},
-  {223, 244, 255, 238, 239, 255, 252, 255, 255, 255,},
-  {224, 245, 255, 238, 240, 255, 252, 255, 255, 255,},
-  {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,},
-  {226, 246, 255, 240, 241, 255, 253, 255, 255, 255,},
-  {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,},
-  {228, 247, 255, 241, 242, 255, 253, 255, 255, 255,},
-  {229, 247, 255, 242, 242, 255, 253, 255, 255, 255,},
-  {230, 248, 255, 242, 243, 255, 253, 255, 255, 255,},
-  {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,},
-  {232, 248, 255, 243, 244, 255, 254, 255, 255, 255,},
-  {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,},
-  {234, 249, 255, 245, 245, 255, 254, 255, 255, 255,},
-  {235, 250, 255, 245, 246, 255, 254, 255, 255, 255,},
-  {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,},
-  {237, 251, 255, 246, 247, 255, 255, 255, 255, 255,},
-  {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {239, 251, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {240, 252, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {242, 252, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {243, 253, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {245, 253, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {246, 254, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {248, 254, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {249, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {250, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {252, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-  {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,},
-  {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,}
-};
-
-const vp9_prob vp9_modelcoefprobs_gg625[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   2,   6,  88, 130,  10,  88,  35,  94,  40,},
-  {2,   4,  11,  89, 131,  19,  90,  60,  99,  67,},
-  {3,   6,  15,  90, 132,  27,  92,  80, 103,  88,},
-  {4,   7,  20,  91, 132,  34,  93,  97, 107, 105,},
-  {5,   9,  24,  92, 133,  41,  94, 112, 110, 120,},
-  {6,  11,  28,  93, 134,  48,  95, 125, 113, 132,},
-  {7,  13,  33,  93, 134,  54,  97, 136, 116, 143,},
-  {8,  14,  36,  94, 135,  60,  98, 146, 119, 152,},
-  {9,  16,  40,  95, 135,  65,  99, 155, 122, 161,},
-  {10,  18,  44,  95, 136,  70, 100, 163, 125, 168,},
-  {11,  19,  48,  96, 136,  75, 101, 170, 127, 175,},
-  {12,  21,  51,  97, 137,  80, 102, 176, 130, 181,},
-  {13,  23,  55,  97, 137,  85, 102, 182, 132, 187,},
-  {14,  24,  58,  98, 138,  89, 103, 188, 135, 192,},
-  {15,  26,  61,  99, 138,  94, 104, 193, 137, 196,},
-  {16,  27,  64,  99, 139,  98, 105, 197, 140, 201,},
-  {17,  29,  67, 100, 139, 102, 106, 201, 142, 205,},
-  {18,  30,  70, 101, 140, 106, 107, 205, 144, 208,},
-  {19,  32,  73, 101, 140, 109, 108, 209, 146, 211,},
-  {20,  34,  76, 102, 140, 113, 109, 212, 148, 214,},
-  {21,  35,  79, 102, 141, 116, 109, 215, 151, 217,},
-  {22,  37,  82, 103, 141, 120, 110, 218, 153, 220,},
-  {23,  38,  85, 103, 142, 123, 111, 220, 155, 222,},
-  {24,  40,  87, 104, 142, 126, 112, 223, 157, 224,},
-  {25,  41,  90, 105, 143, 129, 113, 225, 159, 226,},
-  {26,  42,  93, 105, 143, 132, 113, 227, 161, 228,},
-  {27,  44,  95, 106, 143, 135, 114, 229, 162, 230,},
-  {28,  45,  98, 106, 144, 138, 115, 230, 164, 232,},
-  {29,  47, 100, 107, 144, 141, 116, 232, 166, 233,},
-  {30,  48, 103, 107, 145, 144, 117, 234, 168, 235,},
-  {31,  50, 105, 108, 145, 146, 117, 235, 170, 236,},
-  {32,  51, 107, 108, 145, 149, 118, 236, 171, 237,},
-  {33,  52, 110, 109, 146, 151, 119, 238, 173, 238,},
-  {34,  54, 112, 110, 146, 154, 120, 239, 175, 239,},
-  {35,  55, 114, 110, 147, 156, 120, 240, 176, 240,},
-  {36,  57, 116, 111, 147, 158, 121, 241, 178, 241,},
-  {37,  58, 119, 111, 147, 161, 122, 242, 180, 242,},
-  {38,  59, 121, 112, 148, 163, 123, 243, 181, 243,},
-  {39,  61, 123, 112, 148, 165, 123, 244, 183, 244,},
-  {40,  62, 125, 113, 148, 167, 124, 244, 184, 245,},
-  {41,  63, 127, 113, 149, 169, 125, 245, 186, 245,},
-  {42,  65, 129, 114, 149, 171, 126, 246, 187, 246,},
-  {43,  66, 131, 114, 150, 173, 126, 246, 188, 247,},
-  {44,  67, 133, 115, 150, 175, 127, 247, 190, 247,},
-  {45,  69, 135, 115, 150, 177, 128, 247, 191, 248,},
-  {46,  70, 136, 116, 151, 178, 129, 248, 193, 248,},
-  {47,  71, 138, 116, 151, 180, 129, 248, 194, 249,},
-  {48,  73, 140, 117, 151, 182, 130, 249, 195, 249,},
-  {49,  74, 142, 118, 152, 184, 131, 249, 197, 250,},
-  {50,  75, 144, 118, 152, 185, 131, 250, 198, 250,},
-  {51,  76, 145, 119, 153, 187, 132, 250, 199, 250,},
-  {52,  78, 147, 119, 153, 188, 133, 251, 200, 251,},
-  {53,  79, 149, 120, 153, 190, 134, 251, 201, 251,},
-  {54,  80, 151, 120, 154, 192, 134, 251, 203, 251,},
-  {55,  82, 152, 121, 154, 193, 135, 251, 204, 252,},
-  {56,  83, 154, 121, 154, 194, 136, 252, 205, 252,},
-  {57,  84, 155, 122, 155, 196, 136, 252, 206, 252,},
-  {58,  85, 157, 122, 155, 197, 137, 252, 207, 252,},
-  {59,  86, 158, 123, 156, 199, 138, 252, 208, 252,},
-  {60,  88, 160, 123, 156, 200, 139, 253, 209, 253,},
-  {61,  89, 162, 124, 156, 201, 139, 253, 210, 253,},
-  {62,  90, 163, 124, 157, 202, 140, 253, 211, 253,},
-  {63,  91, 164, 125, 157, 204, 141, 253, 212, 253,},
-  {64,  93, 166, 125, 157, 205, 141, 253, 213, 253,},
-  {65,  94, 167, 126, 158, 206, 142, 254, 214, 254,},
-  {66,  95, 169, 126, 158, 207, 143, 254, 215, 254,},
-  {67,  96, 170, 127, 158, 208, 143, 254, 216, 254,},
-  {68,  97, 172, 127, 159, 209, 144, 254, 217, 254,},
-  {69,  98, 173, 128, 159, 210, 145, 254, 218, 254,},
-  {70, 100, 174, 128, 160, 212, 146, 254, 219, 254,},
-  {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,},
-  {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,},
-  {73, 103, 178, 130, 161, 215, 148, 255, 221, 255,},
-  {74, 104, 179, 131, 161, 216, 148, 255, 222, 255,},
-  {75, 105, 181, 131, 161, 217, 149, 255, 223, 255,},
-  {76, 107, 182, 132, 162, 217, 150, 255, 224, 255,},
-  {77, 108, 183, 132, 162, 218, 150, 255, 224, 255,},
-  {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,},
-  {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,},
-  {80, 111, 187, 134, 163, 221, 153, 255, 227, 255,},
-  {81, 112, 188, 134, 164, 222, 153, 255, 227, 255,},
-  {82, 113, 189, 135, 164, 223, 154, 255, 228, 255,},
-  {83, 115, 190, 135, 164, 223, 155, 255, 229, 255,},
-  {84, 116, 191, 136, 165, 224, 155, 255, 229, 255,},
-  {85, 117, 192, 136, 165, 225, 156, 255, 230, 255,},
-  {86, 118, 193, 137, 165, 226, 157, 255, 231, 255,},
-  {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,},
-  {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,},
-  {89, 121, 196, 139, 167, 228, 159, 255, 232, 255,},
-  {90, 122, 197, 139, 167, 229, 159, 255, 233, 255,},
-  {91, 123, 198, 140, 167, 229, 160, 255, 234, 255,},
-  {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,},
-  {93, 125, 200, 141, 168, 231, 162, 255, 235, 255,},
-  {94, 127, 201, 141, 168, 231, 162, 255, 235, 255,},
-  {95, 128, 202, 142, 169, 232, 163, 255, 236, 255,},
-  {96, 129, 203, 142, 169, 232, 164, 255, 236, 255,},
-  {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,},
-  {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,},
-  {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,},
-  {100, 133, 207, 144, 171, 235, 166, 255, 238, 255,},
-  {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,},
-  {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,},
-  {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,},
-  {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,},
-  {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,},
-  {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,},
-  {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,},
-  {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,},
-  {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,},
-  {110, 143, 215, 150, 175, 240, 173, 255, 242, 255,},
-  {111, 144, 216, 151, 175, 240, 174, 255, 243, 255,},
-  {112, 145, 217, 151, 175, 240, 174, 255, 243, 255,},
-  {113, 146, 217, 152, 176, 241, 175, 255, 244, 255,},
-  {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,},
-  {115, 148, 219, 153, 176, 242, 177, 255, 244, 255,},
-  {116, 149, 219, 153, 177, 242, 177, 255, 245, 255,},
-  {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,},
-  {118, 151, 221, 155, 178, 243, 179, 255, 245, 255,},
-  {119, 152, 222, 155, 178, 243, 179, 255, 245, 255,},
-  {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,},
-  {121, 154, 223, 156, 179, 244, 181, 255, 246, 255,},
-  {122, 155, 224, 157, 179, 244, 181, 255, 246, 255,},
-  {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,},
-  {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,},
-  {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,},
-  {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,},
-  {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,},
-  {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,},
-  {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,},
-  {130, 163, 228, 161, 182, 247, 187, 255, 248, 255,},
-  {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,},
-  {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,},
-  {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,},
-  {134, 166, 231, 164, 184, 248, 189, 255, 249, 255,},
-  {135, 167, 231, 164, 184, 248, 190, 255, 250, 255,},
-  {136, 168, 232, 165, 185, 248, 191, 255, 250, 255,},
-  {137, 169, 232, 166, 185, 248, 191, 255, 250, 255,},
-  {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,},
-  {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,},
-  {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,},
-  {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,},
-  {142, 174, 235, 169, 187, 250, 194, 255, 251, 255,},
-  {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,},
-  {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,},
-  {145, 177, 236, 170, 189, 250, 196, 255, 251, 255,},
-  {146, 177, 237, 171, 189, 250, 197, 255, 252, 255,},
-  {147, 178, 237, 172, 189, 251, 198, 255, 252, 255,},
-  {148, 179, 238, 172, 190, 251, 198, 255, 252, 255,},
-  {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,},
-  {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,},
-  {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,},
-  {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,},
-  {153, 184, 240, 175, 192, 252, 202, 255, 252, 255,},
-  {154, 184, 240, 176, 193, 252, 202, 255, 253, 255,},
-  {155, 185, 240, 177, 193, 252, 203, 255, 253, 255,},
-  {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,},
-  {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,},
-  {158, 188, 242, 178, 194, 252, 205, 255, 253, 255,},
-  {159, 189, 242, 179, 195, 252, 205, 255, 253, 255,},
-  {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,},
-  {161, 190, 243, 180, 196, 253, 207, 255, 253, 255,},
-  {162, 191, 243, 181, 196, 253, 207, 255, 254, 255,},
-  {163, 192, 243, 182, 197, 253, 208, 255, 254, 255,},
-  {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,},
-  {165, 194, 244, 183, 197, 253, 209, 255, 254, 255,},
-  {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,},
-  {167, 196, 245, 184, 198, 253, 210, 255, 254, 255,},
-  {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,},
-  {169, 197, 245, 186, 199, 254, 212, 255, 254, 255,},
-  {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,},
-  {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,},
-  {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,},
-  {173, 200, 246, 188, 201, 254, 214, 255, 254, 255,},
-  {174, 201, 247, 189, 202, 254, 215, 255, 254, 255,},
-  {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,},
-  {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,},
-  {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,},
-  {178, 204, 248, 191, 204, 254, 217, 255, 255, 255,},
-  {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,},
-  {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,},
-  {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,},
-  {182, 208, 249, 194, 205, 255, 220, 255, 255, 255,},
-  {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,},
-  {184, 209, 249, 196, 206, 255, 221, 255, 255, 255,},
-  {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,},
-  {186, 211, 250, 197, 207, 255, 222, 255, 255, 255,},
-  {187, 211, 250, 198, 208, 255, 223, 255, 255, 255,},
-  {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,},
-  {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,},
-  {190, 214, 251, 200, 209, 255, 224, 255, 255, 255,},
-  {191, 215, 251, 200, 210, 255, 225, 255, 255, 255,},
-  {192, 215, 251, 201, 211, 255, 225, 255, 255, 255,},
-  {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,},
-  {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,},
-  {195, 218, 252, 203, 212, 255, 227, 255, 255, 255,},
-  {196, 218, 252, 204, 213, 255, 228, 255, 255, 255,},
-  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},
-  {198, 220, 252, 205, 214, 255, 229, 255, 255, 255,},
-  {199, 221, 252, 206, 214, 255, 229, 255, 255, 255,},
-  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},
-  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},
-  {202, 223, 253, 208, 216, 255, 231, 255, 255, 255,},
-  {203, 223, 253, 209, 216, 255, 232, 255, 255, 255,},
-  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},
-  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},
-  {206, 226, 253, 211, 218, 255, 233, 255, 255, 255,},
-  {207, 226, 253, 212, 219, 255, 234, 255, 255, 255,},
-  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},
-  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},
-  {210, 228, 254, 214, 220, 255, 236, 255, 255, 255,},
-  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},
-  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},
-  {213, 230, 254, 217, 222, 255, 237, 255, 255, 255,},
-  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},
-  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},
-  {216, 233, 254, 219, 224, 255, 239, 255, 255, 255,},
-  {217, 233, 254, 220, 225, 255, 239, 255, 255, 255,},
-  {218, 234, 255, 220, 225, 255, 240, 255, 255, 255,},
-  {219, 235, 255, 221, 226, 255, 240, 255, 255, 255,},
-  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},
-  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},
-  {222, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
-  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
-  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},
-  {225, 238, 255, 226, 230, 255, 243, 255, 255, 255,},
-  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},
-  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},
-  {228, 240, 255, 228, 232, 255, 245, 255, 255, 255,},
-  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},
-  {230, 242, 255, 230, 233, 255, 246, 255, 255, 255,},
-  {231, 242, 255, 231, 234, 255, 246, 255, 255, 255,},
-  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},
-  {233, 243, 255, 233, 235, 255, 247, 255, 255, 255,},
-  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},
-  {235, 245, 255, 234, 236, 255, 248, 255, 255, 255,},
-  {236, 245, 255, 235, 237, 255, 248, 255, 255, 255,},
-  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},
-  {238, 247, 255, 237, 239, 255, 249, 255, 255, 255,},
-  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},
-  {240, 248, 255, 239, 240, 255, 250, 255, 255, 255,},
-  {241, 248, 255, 240, 241, 255, 251, 255, 255, 255,},
-  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},
-  {243, 249, 255, 241, 243, 255, 251, 255, 255, 255,},
-  {244, 250, 255, 242, 243, 255, 252, 255, 255, 255,},
-  {245, 251, 255, 243, 244, 255, 252, 255, 255, 255,},
-  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},
-  {247, 252, 255, 245, 246, 255, 253, 255, 255, 255,},
-  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},
-  {249, 253, 255, 247, 248, 255, 254, 255, 255, 255,},
-  {250, 253, 255, 248, 249, 255, 254, 255, 255, 255,},
-  {251, 254, 255, 249, 250, 255, 254, 255, 255, 255,},
-  {252, 254, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {253, 255, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {255, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
+// beta = 8
+const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
+  {  3,  86, 128,   6,  86,  23,  88,  29},
+  {  9,  86, 129,  17,  88,  61,  94,  76},
+  { 15,  87, 129,  28,  89,  93, 100, 110},
+  { 20,  88, 130,  38,  91, 118, 106, 136},
+  { 26,  89, 131,  48,  92, 139, 111, 156},
+  { 31,  90, 131,  58,  94, 156, 117, 171},
+  { 37,  90, 132,  66,  95, 171, 122, 184},
+  { 42,  91, 132,  75,  97, 183, 127, 194},
+  { 47,  92, 133,  83,  98, 193, 132, 202},
+  { 52,  93, 133,  90, 100, 201, 137, 208},
+  { 57,  94, 134,  98, 101, 208, 142, 214},
+  { 62,  94, 135, 105, 103, 214, 146, 218},
+  { 66,  95, 135, 111, 104, 219, 151, 222},
+  { 71,  96, 136, 117, 106, 224, 155, 225},
+  { 76,  97, 136, 123, 107, 227, 159, 228},
+  { 80,  98, 137, 129, 109, 231, 162, 231},
+  { 84,  98, 138, 134, 110, 234, 166, 233},
+  { 89,  99, 138, 140, 112, 236, 170, 235},
+  { 93, 100, 139, 145, 113, 238, 173, 236},
+  { 97, 101, 140, 149, 115, 240, 176, 238},
+  {101, 102, 140, 154, 116, 242, 179, 239},
+  {105, 103, 141, 158, 118, 243, 182, 240},
+  {109, 104, 141, 162, 119, 244, 185, 241},
+  {113, 104, 142, 166, 120, 245, 187, 242},
+  {116, 105, 143, 170, 122, 246, 190, 243},
+  {120, 106, 143, 173, 123, 247, 192, 244},
+  {123, 107, 144, 177, 125, 248, 195, 244},
+  {127, 108, 145, 180, 126, 249, 197, 245},
+  {130, 109, 145, 183, 128, 249, 199, 245},
+  {134, 110, 146, 186, 129, 250, 201, 246},
+  {137, 111, 147, 189, 131, 251, 203, 246},
+  {140, 112, 147, 192, 132, 251, 205, 247},
+  {143, 113, 148, 194, 133, 251, 207, 247},
+  {146, 114, 149, 197, 135, 252, 208, 248},
+  {149, 115, 149, 199, 136, 252, 210, 248},
+  {152, 115, 150, 201, 138, 252, 211, 248},
+  {155, 116, 151, 204, 139, 253, 213, 249},
+  {158, 117, 151, 206, 140, 253, 214, 249},
+  {161, 118, 152, 208, 142, 253, 216, 249},
+  {163, 119, 153, 210, 143, 253, 217, 249},
+  {166, 120, 153, 212, 144, 254, 218, 250},
+  {168, 121, 154, 213, 146, 254, 220, 250},
+  {171, 122, 155, 215, 147, 254, 221, 250},
+  {173, 123, 155, 217, 148, 254, 222, 250},
+  {176, 124, 156, 218, 150, 254, 223, 250},
+  {178, 125, 157, 220, 151, 254, 224, 251},
+  {180, 126, 157, 221, 152, 254, 225, 251},
+  {183, 127, 158, 222, 153, 254, 226, 251},
+  {185, 128, 159, 224, 155, 255, 227, 251},
+  {187, 129, 160, 225, 156, 255, 228, 251},
+  {189, 131, 160, 226, 157, 255, 228, 251},
+  {191, 132, 161, 227, 159, 255, 229, 251},
+  {193, 133, 162, 228, 160, 255, 230, 252},
+  {195, 134, 163, 230, 161, 255, 231, 252},
+  {197, 135, 163, 231, 162, 255, 231, 252},
+  {199, 136, 164, 232, 163, 255, 232, 252},
+  {201, 137, 165, 233, 165, 255, 233, 252},
+  {202, 138, 166, 233, 166, 255, 233, 252},
+  {204, 139, 166, 234, 167, 255, 234, 252},
+  {206, 140, 167, 235, 168, 255, 235, 252},
+  {207, 141, 168, 236, 169, 255, 235, 252},
+  {209, 142, 169, 237, 171, 255, 236, 252},
+  {210, 144, 169, 237, 172, 255, 236, 252},
+  {212, 145, 170, 238, 173, 255, 237, 252},
+  {214, 146, 171, 239, 174, 255, 237, 253},
+  {215, 147, 172, 240, 175, 255, 238, 253},
+  {216, 148, 173, 240, 176, 255, 238, 253},
+  {218, 149, 173, 241, 177, 255, 239, 253},
+  {219, 150, 174, 241, 179, 255, 239, 253},
+  {220, 152, 175, 242, 180, 255, 240, 253},
+  {222, 153, 176, 242, 181, 255, 240, 253},
+  {223, 154, 177, 243, 182, 255, 240, 253},
+  {224, 155, 178, 244, 183, 255, 241, 253},
+  {225, 156, 178, 244, 184, 255, 241, 253},
+  {226, 158, 179, 244, 185, 255, 242, 253},
+  {228, 159, 180, 245, 186, 255, 242, 253},
+  {229, 160, 181, 245, 187, 255, 242, 253},
+  {230, 161, 182, 246, 188, 255, 243, 253},
+  {231, 163, 183, 246, 189, 255, 243, 253},
+  {232, 164, 184, 247, 190, 255, 243, 253},
+  {233, 165, 185, 247, 191, 255, 244, 253},
+  {234, 166, 185, 247, 192, 255, 244, 253},
+  {235, 168, 186, 248, 193, 255, 244, 253},
+  {236, 169, 187, 248, 194, 255, 244, 253},
+  {236, 170, 188, 248, 195, 255, 245, 253},
+  {237, 171, 189, 249, 196, 255, 245, 254},
+  {238, 173, 190, 249, 197, 255, 245, 254},
+  {239, 174, 191, 249, 198, 255, 245, 254},
+  {240, 175, 192, 249, 199, 255, 246, 254},
+  {240, 177, 193, 250, 200, 255, 246, 254},
+  {241, 178, 194, 250, 201, 255, 246, 254},
+  {242, 179, 195, 250, 202, 255, 246, 254},
+  {242, 181, 196, 250, 203, 255, 247, 254},
+  {243, 182, 197, 251, 204, 255, 247, 254},
+  {244, 184, 198, 251, 205, 255, 247, 254},
+  {244, 185, 199, 251, 206, 255, 247, 254},
+  {245, 186, 200, 251, 207, 255, 247, 254},
+  {246, 188, 201, 252, 207, 255, 248, 254},
+  {246, 189, 202, 252, 208, 255, 248, 254},
+  {247, 191, 203, 252, 209, 255, 248, 254},
+  {247, 192, 204, 252, 210, 255, 248, 254},
+  {248, 194, 205, 252, 211, 255, 248, 254},
+  {248, 195, 206, 252, 212, 255, 249, 254},
+  {249, 197, 207, 253, 213, 255, 249, 254},
+  {249, 198, 208, 253, 214, 255, 249, 254},
+  {250, 200, 210, 253, 215, 255, 249, 254},
+  {250, 201, 211, 253, 215, 255, 249, 254},
+  {250, 203, 212, 253, 216, 255, 249, 254},
+  {251, 204, 213, 253, 217, 255, 250, 254},
+  {251, 206, 214, 254, 218, 255, 250, 254},
+  {252, 207, 216, 254, 219, 255, 250, 254},
+  {252, 209, 217, 254, 220, 255, 250, 254},
+  {252, 211, 218, 254, 221, 255, 250, 254},
+  {253, 213, 219, 254, 222, 255, 250, 254},
+  {253, 214, 221, 254, 223, 255, 250, 254},
+  {253, 216, 222, 254, 224, 255, 251, 254},
+  {253, 218, 224, 254, 225, 255, 251, 254},
+  {254, 220, 225, 254, 225, 255, 251, 254},
+  {254, 222, 227, 255, 226, 255, 251, 254},
+  {254, 224, 228, 255, 227, 255, 251, 254},
+  {254, 226, 230, 255, 228, 255, 251, 254},
+  {255, 228, 231, 255, 230, 255, 251, 254},
+  {255, 230, 233, 255, 231, 255, 252, 254},
+  {255, 232, 235, 255, 232, 255, 252, 254},
+  {255, 235, 237, 255, 233, 255, 252, 254},
+  {255, 238, 240, 255, 235, 255, 252, 255},
+  {255, 241, 243, 255, 236, 255, 252, 254},
+  {255, 246, 247, 255, 239, 255, 253, 255}
 };
 
-const vp9_prob vp9_modelcoefprobs_gg875p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   1,   3,  86, 128,   6,  86,  22,  89,  28,},
-  {1,   2,   6,  86, 129,  11,  87,  42,  92,  52,},
-  {2,   3,   9,  87, 129,  17,  88,  59,  94,  73,},
-  {2,   4,  12,  87, 129,  22,  89,  75,  97,  92,},
-  {3,   5,  14,  88, 130,  27,  89,  90, 100, 108,},
-  {3,   6,  17,  88, 130,  33,  90, 103, 102, 122,},
-  {4,   7,  20,  88, 130,  37,  91, 115, 105, 135,},
-  {4,   8,  23,  89, 131,  42,  92, 126, 108, 147,},
-  {5,   9,  25,  89, 131,  47,  92, 137, 110, 157,},
-  {5,  10,  28,  90, 131,  52,  93, 146, 113, 167,},
-  {6,  11,  31,  90, 132,  56,  94, 154, 115, 175,},
-  {6,  12,  33,  90, 132,  60,  94, 162, 118, 183,},
-  {7,  13,  36,  91, 132,  65,  95, 170, 120, 190,},
-  {7,  14,  39,  91, 132,  69,  96, 176, 123, 196,},
-  {8,  15,  41,  92, 133,  73,  96, 182, 125, 201,},
-  {8,  16,  44,  92, 133,  77,  97, 188, 128, 206,},
-  {9,  17,  46,  92, 133,  81,  98, 193, 130, 211,},
-  {9,  18,  49,  93, 134,  85,  99, 198, 133, 215,},
-  {10,  19,  51,  93, 134,  89,  99, 203, 135, 219,},
-  {10,  20,  54,  93, 134,  92, 100, 207, 137, 222,},
-  {11,  21,  56,  94, 134,  96, 101, 211, 140, 226,},
-  {12,  22,  58,  94, 135, 100, 101, 214, 142, 228,},
-  {12,  23,  61,  95, 135, 103, 102, 217, 145, 231,},
-  {13,  24,  63,  95, 135, 106, 103, 220, 147, 233,},
-  {13,  25,  66,  95, 136, 110, 103, 223, 149, 235,},
-  {14,  26,  68,  96, 136, 113, 104, 226, 151, 237,},
-  {14,  27,  70,  96, 136, 116, 105, 228, 154, 239,},
-  {15,  28,  72,  97, 136, 119, 106, 230, 156, 241,},
-  {15,  29,  75,  97, 137, 122, 106, 232, 158, 242,},
-  {16,  30,  77,  97, 137, 125, 107, 234, 160, 243,},
-  {17,  31,  79,  98, 137, 128, 108, 236, 163, 245,},
-  {17,  32,  81,  98, 138, 131, 108, 237, 165, 246,},
-  {18,  33,  83,  99, 138, 134, 109, 239, 167, 247,},
-  {18,  34,  86,  99, 138, 137, 110, 240, 169, 248,},
-  {19,  35,  88,  99, 138, 140, 111, 242, 171, 248,},
-  {19,  36,  90, 100, 139, 142, 111, 243, 173, 249,},
-  {20,  37,  92, 100, 139, 145, 112, 244, 175, 250,},
-  {20,  38,  94, 101, 139, 148, 113, 245, 177, 250,},
-  {21,  39,  96, 101, 140, 150, 113, 246, 179, 251,},
-  {22,  40,  98, 101, 140, 153, 114, 246, 181, 251,},
-  {22,  41, 100, 102, 140, 155, 115, 247, 183, 252,},
-  {23,  42, 102, 102, 140, 157, 116, 248, 185, 252,},
-  {23,  43, 104, 103, 141, 160, 116, 249, 186, 253,},
-  {24,  44, 106, 103, 141, 162, 117, 249, 188, 253,},
-  {25,  45, 108, 103, 141, 164, 118, 250, 190, 253,},
-  {25,  46, 110, 104, 142, 166, 119, 250, 192, 253,},
-  {26,  47, 112, 104, 142, 168, 119, 251, 193, 254,},
-  {26,  48, 114, 105, 142, 171, 120, 251, 195, 254,},
-  {27,  49, 116, 105, 143, 173, 121, 252, 197, 254,},
-  {27,  50, 118, 105, 143, 175, 122, 252, 198, 254,},
-  {28,  51, 119, 106, 143, 177, 122, 252, 200, 254,},
-  {29,  52, 121, 106, 143, 179, 123, 253, 201, 255,},
-  {29,  53, 123, 107, 144, 180, 124, 253, 203, 255,},
-  {30,  54, 125, 107, 144, 182, 125, 253, 204, 255,},
-  {30,  55, 127, 108, 144, 184, 125, 253, 206, 255,},
-  {31,  56, 128, 108, 145, 186, 126, 254, 207, 255,},
-  {32,  57, 130, 108, 145, 188, 127, 254, 209, 255,},
-  {32,  58, 132, 109, 145, 189, 128, 254, 210, 255,},
-  {33,  59, 134, 109, 146, 191, 128, 254, 211, 255,},
-  {33,  60, 135, 110, 146, 193, 129, 254, 213, 255,},
-  {34,  61, 137, 110, 146, 194, 130, 254, 214, 255,},
-  {35,  62, 139, 111, 146, 196, 131, 255, 215, 255,},
-  {35,  63, 140, 111, 147, 197, 131, 255, 216, 255,},
-  {36,  64, 142, 112, 147, 199, 132, 255, 218, 255,},
-  {37,  65, 144, 112, 147, 200, 133, 255, 219, 255,},
-  {37,  66, 145, 112, 148, 202, 134, 255, 220, 255,},
-  {38,  67, 147, 113, 148, 203, 135, 255, 221, 255,},
-  {38,  68, 148, 113, 148, 204, 135, 255, 222, 255,},
-  {39,  69, 150, 114, 149, 206, 136, 255, 223, 255,},
-  {40,  70, 151, 114, 149, 207, 137, 255, 224, 255,},
-  {40,  71, 153, 115, 149, 208, 138, 255, 225, 255,},
-  {41,  72, 154, 115, 150, 210, 138, 255, 226, 255,},
-  {42,  73, 156, 116, 150, 211, 139, 255, 227, 255,},
-  {42,  74, 157, 116, 150, 212, 140, 255, 228, 255,},
-  {43,  75, 159, 117, 151, 213, 141, 255, 229, 255,},
-  {44,  76, 160, 117, 151, 214, 142, 255, 230, 255,},
-  {44,  77, 162, 117, 151, 216, 142, 255, 231, 255,},
-  {45,  78, 163, 118, 152, 217, 143, 255, 231, 255,},
-  {45,  79, 165, 118, 152, 218, 144, 255, 232, 255,},
-  {46,  80, 166, 119, 152, 219, 145, 255, 233, 255,},
-  {47,  81, 167, 119, 153, 220, 146, 255, 234, 255,},
-  {47,  82, 169, 120, 153, 221, 146, 255, 235, 255,},
-  {48,  83, 170, 120, 153, 222, 147, 255, 235, 255,},
-  {49,  84, 171, 121, 154, 223, 148, 255, 236, 255,},
-  {49,  85, 173, 121, 154, 224, 149, 255, 237, 255,},
-  {50,  86, 174, 122, 154, 225, 150, 255, 237, 255,},
-  {51,  87, 175, 122, 155, 225, 150, 255, 238, 255,},
-  {51,  88, 177, 123, 155, 226, 151, 255, 239, 255,},
-  {52,  89, 178, 123, 155, 227, 152, 255, 239, 255,},
-  {53,  90, 179, 124, 156, 228, 153, 255, 240, 255,},
-  {53,  91, 180, 124, 156, 229, 154, 255, 240, 255,},
-  {54,  92, 182, 125, 156, 230, 154, 255, 241, 255,},
-  {55,  93, 183, 125, 157, 230, 155, 255, 241, 255,},
-  {55,  94, 184, 126, 157, 231, 156, 255, 242, 255,},
-  {56,  95, 185, 126, 157, 232, 157, 255, 242, 255,},
-  {57,  96, 187, 127, 158, 233, 158, 255, 243, 255,},
-  {57,  97, 188, 127, 158, 233, 159, 255, 243, 255,},
-  {58,  98, 189, 128, 158, 234, 159, 255, 244, 255,},
-  {59,  99, 190, 128, 159, 235, 160, 255, 244, 255,},
-  {60, 100, 191, 129, 159, 235, 161, 255, 245, 255,},
-  {60, 101, 192, 129, 160, 236, 162, 255, 245, 255,},
-  {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,},
-  {62, 103, 194, 131, 160, 237, 164, 255, 246, 255,},
-  {62, 104, 196, 131, 161, 238, 164, 255, 246, 255,},
-  {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,},
-  {64, 106, 198, 132, 161, 239, 166, 255, 247, 255,},
-  {64, 107, 199, 133, 162, 239, 167, 255, 247, 255,},
-  {65, 108, 200, 133, 162, 240, 168, 255, 248, 255,},
-  {66, 109, 201, 134, 163, 241, 168, 255, 248, 255,},
-  {67, 110, 202, 134, 163, 241, 169, 255, 248, 255,},
-  {67, 111, 203, 135, 163, 242, 170, 255, 249, 255,},
-  {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,},
-  {69, 113, 205, 136, 164, 242, 172, 255, 249, 255,},
-  {69, 114, 206, 137, 164, 243, 173, 255, 250, 255,},
-  {70, 115, 207, 137, 165, 243, 173, 255, 250, 255,},
-  {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,},
-  {72, 117, 208, 138, 166, 244, 175, 255, 250, 255,},
-  {72, 118, 209, 139, 166, 245, 176, 255, 251, 255,},
-  {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,},
-  {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,},
-  {75, 121, 212, 141, 167, 246, 178, 255, 251, 255,},
-  {75, 122, 213, 141, 168, 246, 179, 255, 251, 255,},
-  {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,},
-  {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,},
-  {78, 125, 215, 143, 169, 247, 182, 255, 252, 255,},
-  {78, 126, 216, 144, 169, 247, 182, 255, 252, 255,},
-  {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,},
-  {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,},
-  {81, 129, 219, 145, 170, 248, 185, 255, 253, 255,},
-  {82, 130, 219, 146, 171, 249, 186, 255, 253, 255,},
-  {82, 131, 220, 147, 171, 249, 187, 255, 253, 255,},
-  {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,},
-  {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,},
-  {85, 134, 222, 148, 173, 250, 189, 255, 253, 255,},
-  {85, 135, 223, 149, 173, 250, 190, 255, 254, 255,},
-  {86, 136, 224, 150, 173, 250, 191, 255, 254, 255,},
-  {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,},
-  {88, 138, 225, 151, 174, 251, 192, 255, 254, 255,},
-  {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,},
-  {89, 140, 227, 152, 175, 251, 194, 255, 254, 255,},
-  {90, 141, 227, 153, 176, 251, 195, 255, 254, 255,},
-  {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,},
-  {92, 143, 229, 154, 176, 252, 196, 255, 254, 255,},
-  {93, 144, 229, 155, 177, 252, 197, 255, 254, 255,},
-  {93, 145, 230, 155, 177, 252, 198, 255, 255, 255,},
-  {94, 146, 231, 156, 178, 252, 199, 255, 255, 255,},
-  {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,},
-  {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,},
-  {97, 149, 232, 158, 179, 253, 201, 255, 255, 255,},
-  {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,},
-  {99, 151, 234, 159, 180, 253, 202, 255, 255, 255,},
-  {99, 152, 234, 160, 181, 253, 203, 255, 255, 255,},
-  {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,},
-  {101, 154, 235, 162, 182, 253, 205, 255, 255, 255,},
-  {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,},
-  {103, 156, 236, 163, 183, 254, 206, 255, 255, 255,},
-  {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,},
-  {105, 158, 237, 164, 183, 254, 208, 255, 255, 255,},
-  {105, 159, 238, 165, 184, 254, 209, 255, 255, 255,},
-  {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,},
-  {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,},
-  {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,},
-  {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,},
-  {110, 164, 240, 169, 186, 254, 212, 255, 255, 255,},
-  {111, 165, 241, 169, 187, 254, 213, 255, 255, 255,},
-  {112, 166, 241, 170, 187, 255, 214, 255, 255, 255,},
-  {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,},
-  {114, 168, 242, 172, 189, 255, 215, 255, 255, 255,},
-  {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,},
-  {115, 170, 243, 173, 190, 255, 217, 255, 255, 255,},
-  {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,},
-  {117, 172, 244, 175, 191, 255, 218, 255, 255, 255,},
-  {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,},
-  {119, 174, 244, 176, 192, 255, 220, 255, 255, 255,},
-  {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,},
-  {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,},
-  {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,},
-  {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,},
-  {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,},
-  {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,},
-  {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,},
-  {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,},
-  {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,},
-  {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,},
-  {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,},
-  {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,},
-  {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,},
-  {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,},
-  {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,},
-  {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,},
-  {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,},
-  {137, 192, 250, 191, 202, 255, 231, 255, 255, 255,},
-  {138, 193, 250, 191, 203, 255, 232, 255, 255, 255,},
-  {139, 194, 250, 192, 203, 255, 233, 255, 255, 255,},
-  {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,},
-  {142, 196, 251, 194, 204, 255, 234, 255, 255, 255,},
-  {143, 197, 251, 195, 205, 255, 234, 255, 255, 255,},
-  {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,},
-  {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,},
-  {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,},
-  {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,},
-  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},
-  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},
-  {151, 204, 253, 201, 210, 255, 238, 255, 255, 255,},
-  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},
-  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},
-  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},
-  {155, 208, 253, 205, 212, 255, 241, 255, 255, 255,},
-  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},
-  {158, 210, 253, 206, 214, 255, 242, 255, 255, 255,},
-  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},
-  {160, 212, 254, 208, 215, 255, 243, 255, 255, 255,},
-  {162, 213, 254, 209, 216, 255, 243, 255, 255, 255,},
-  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},
-  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},
-  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},
-  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},
-  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},
-  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},
-  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},
-  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},
-  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},
-  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},
-  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},
-  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},
-  {179, 226, 255, 222, 226, 255, 249, 255, 255, 255,},
-  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},
-  {182, 228, 255, 224, 227, 255, 250, 255, 255, 255,},
-  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},
-  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},
-  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},
-  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},
-  {190, 233, 255, 229, 232, 255, 251, 255, 255, 255,},
-  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},
-  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},
-  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},
-  {197, 237, 255, 233, 235, 255, 253, 255, 255, 255,},
-  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},
-  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},
-  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},
-  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},
-  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},
-  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},
-  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},
-  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},
-  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},
-  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},
-  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},
-  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},
-  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},
-  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},
-  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-};
+static void extend_model_to_full_distribution(vp9_prob p,
+                                              vp9_prob *tree_probs) {
+  const int l = ((p - 1) / 2);
+  const vp9_prob (*model)[MODEL_NODES];
+  model = vp9_modelcoefprobs_pareto8;
+  if (p & 1) {
+    vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
+               model[l], MODEL_NODES * sizeof(vp9_prob));
+  } else {
+    // interpolate
+    int i;
+    for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+      tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] +
+                       model[l + 1][i - UNCONSTRAINED_NODES]) >> 1;
+  }
+}
 
-const vp9_prob vp9_modelcoefprobs_gg75p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   1,   3,  86, 129,   6,  87,  21,  90,  26,},
-  {1,   2,   6,  87, 129,  11,  88,  39,  93,  47,},
-  {2,   3,   9,  87, 130,  16,  89,  55,  96,  65,},
-  {2,   4,  11,  88, 130,  21,  89,  69,  98,  81,},
-  {3,   5,  14,  88, 130,  26,  90,  82, 101,  95,},
-  {3,   6,  17,  89, 131,  31,  91,  94, 103, 107,},
-  {4,   7,  20,  89, 131,  35,  92, 105, 105, 119,},
-  {4,   8,  22,  90, 131,  40,  92, 115, 108, 129,},
-  {5,   9,  25,  90, 132,  44,  93, 124, 110, 138,},
-  {5,  10,  27,  91, 132,  48,  94, 133, 112, 147,},
-  {6,  11,  30,  91, 132,  52,  95, 141, 114, 155,},
-  {6,  12,  32,  92, 133,  56,  95, 148, 116, 162,},
-  {7,  13,  35,  92, 133,  60,  96, 155, 118, 168,},
-  {7,  14,  37,  92, 133,  64,  97, 161, 121, 174,},
-  {8,  15,  40,  93, 134,  68,  97, 167, 123, 180,},
-  {9,  16,  42,  93, 134,  71,  98, 173, 125, 185,},
-  {9,  17,  44,  94, 134,  75,  99, 178, 127, 190,},
-  {10,  18,  47,  94, 135,  78,  99, 182, 129, 195,},
-  {10,  19,  49,  94, 135,  82, 100, 187, 131, 199,},
-  {11,  20,  51,  95, 135,  85, 100, 191, 133, 202,},
-  {11,  21,  54,  95, 135,  88, 101, 195, 135, 206,},
-  {12,  22,  56,  96, 136,  92, 102, 199, 137, 209,},
-  {13,  23,  58,  96, 136,  95, 102, 202, 138, 213,},
-  {13,  24,  61,  96, 136,  98, 103, 206, 140, 215,},
-  {14,  25,  63,  97, 137, 101, 104, 209, 142, 218,},
-  {14,  26,  65,  97, 137, 104, 104, 211, 144, 221,},
-  {15,  27,  67,  98, 137, 107, 105, 214, 146, 223,},
-  {15,  28,  69,  98, 138, 110, 106, 217, 148, 225,},
-  {16,  29,  71,  98, 138, 113, 106, 219, 150, 227,},
-  {17,  30,  73,  99, 138, 115, 107, 221, 151, 229,},
-  {17,  31,  76,  99, 138, 118, 107, 223, 153, 231,},
-  {18,  32,  78, 100, 139, 121, 108, 225, 155, 232,},
-  {18,  33,  80, 100, 139, 123, 109, 227, 157, 234,},
-  {19,  34,  82, 100, 139, 126, 109, 229, 158, 235,},
-  {20,  35,  84, 101, 140, 128, 110, 231, 160, 237,},
-  {20,  36,  86, 101, 140, 131, 111, 232, 162, 238,},
-  {21,  37,  88, 102, 140, 133, 111, 234, 164, 239,},
-  {21,  38,  90, 102, 140, 136, 112, 235, 165, 240,},
-  {22,  39,  92, 102, 141, 138, 112, 236, 167, 241,},
-  {23,  40,  94, 103, 141, 140, 113, 237, 169, 242,},
-  {23,  41,  95, 103, 141, 143, 114, 238, 170, 243,},
-  {24,  42,  97, 103, 142, 145, 114, 240, 172, 244,},
-  {25,  43,  99, 104, 142, 147, 115, 241, 173, 245,},
-  {25,  44, 101, 104, 142, 149, 116, 242, 175, 246,},
-  {26,  45, 103, 105, 142, 151, 116, 242, 176, 246,},
-  {26,  46, 105, 105, 143, 153, 117, 243, 178, 247,},
-  {27,  47, 107, 105, 143, 156, 117, 244, 180, 248,},
-  {28,  48, 108, 106, 143, 158, 118, 245, 181, 248,},
-  {28,  49, 110, 106, 144, 159, 119, 245, 182, 249,},
-  {29,  50, 112, 107, 144, 161, 119, 246, 184, 249,},
-  {30,  51, 114, 107, 144, 163, 120, 247, 185, 250,},
-  {30,  52, 115, 108, 144, 165, 121, 247, 187, 250,},
-  {31,  53, 117, 108, 145, 167, 121, 248, 188, 250,},
-  {32,  54, 119, 108, 145, 169, 122, 248, 190, 251,},
-  {32,  55, 121, 109, 145, 171, 123, 249, 191, 251,},
-  {33,  56, 122, 109, 146, 172, 123, 249, 192, 251,},
-  {34,  57, 124, 110, 146, 174, 124, 250, 194, 252,},
-  {34,  58, 126, 110, 146, 176, 125, 250, 195, 252,},
-  {35,  59, 127, 110, 147, 177, 125, 250, 196, 252,},
-  {36,  60, 129, 111, 147, 179, 126, 251, 197, 253,},
-  {36,  61, 130, 111, 147, 181, 127, 251, 199, 253,},
-  {37,  62, 132, 112, 147, 182, 127, 251, 200, 253,},
-  {38,  63, 134, 112, 148, 184, 128, 252, 201, 253,},
-  {38,  64, 135, 112, 148, 185, 128, 252, 202, 253,},
-  {39,  65, 137, 113, 148, 187, 129, 252, 204, 254,},
-  {40,  66, 138, 113, 149, 188, 130, 253, 205, 254,},
-  {40,  67, 140, 114, 149, 190, 130, 253, 206, 254,},
-  {41,  68, 141, 114, 149, 191, 131, 253, 207, 254,},
-  {42,  69, 143, 115, 150, 192, 132, 253, 208, 254,},
-  {42,  70, 144, 115, 150, 194, 132, 253, 209, 254,},
-  {43,  71, 146, 115, 150, 195, 133, 254, 210, 254,},
-  {44,  72, 147, 116, 150, 197, 134, 254, 211, 255,},
-  {44,  73, 149, 116, 151, 198, 134, 254, 212, 255,},
-  {45,  74, 150, 117, 151, 199, 135, 254, 213, 255,},
-  {46,  75, 152, 117, 151, 200, 136, 254, 214, 255,},
-  {46,  76, 153, 118, 152, 202, 136, 254, 215, 255,},
-  {47,  77, 154, 118, 152, 203, 137, 254, 216, 255,},
-  {48,  78, 156, 119, 152, 204, 138, 254, 217, 255,},
-  {49,  79, 157, 119, 153, 205, 139, 255, 218, 255,},
-  {49,  80, 159, 119, 153, 206, 139, 255, 219, 255,},
-  {50,  81, 160, 120, 153, 207, 140, 255, 220, 255,},
-  {51,  82, 161, 120, 154, 208, 141, 255, 221, 255,},
-  {51,  83, 163, 121, 154, 210, 141, 255, 222, 255,},
-  {52,  84, 164, 121, 154, 211, 142, 255, 223, 255,},
-  {53,  85, 165, 122, 154, 212, 143, 255, 223, 255,},
-  {54,  86, 166, 122, 155, 213, 143, 255, 224, 255,},
-  {54,  87, 168, 123, 155, 214, 144, 255, 225, 255,},
-  {55,  88, 169, 123, 155, 215, 145, 255, 226, 255,},
-  {56,  89, 170, 123, 156, 216, 145, 255, 227, 255,},
-  {57,  90, 172, 124, 156, 217, 146, 255, 227, 255,},
-  {57,  91, 173, 124, 156, 218, 147, 255, 228, 255,},
-  {58,  92, 174, 125, 157, 218, 147, 255, 229, 255,},
-  {59,  93, 175, 125, 157, 219, 148, 255, 230, 255,},
-  {60,  94, 176, 126, 157, 220, 149, 255, 230, 255,},
-  {60,  95, 178, 126, 158, 221, 150, 255, 231, 255,},
-  {61,  96, 179, 127, 158, 222, 150, 255, 232, 255,},
-  {62,  97, 180, 127, 158, 223, 151, 255, 232, 255,},
-  {63,  98, 181, 128, 159, 224, 152, 255, 233, 255,},
-  {63,  99, 182, 128, 159, 224, 152, 255, 234, 255,},
-  {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,},
-  {65, 101, 184, 129, 160, 226, 154, 255, 235, 255,},
-  {66, 102, 186, 130, 160, 227, 154, 255, 235, 255,},
-  {66, 103, 187, 130, 160, 227, 155, 255, 236, 255,},
-  {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,},
-  {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,},
-  {69, 106, 190, 132, 161, 230, 157, 255, 238, 255,},
-  {69, 107, 191, 132, 162, 230, 158, 255, 238, 255,},
-  {70, 108, 192, 133, 162, 231, 159, 255, 239, 255,},
-  {71, 109, 193, 133, 163, 232, 159, 255, 239, 255,},
-  {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,},
-  {73, 111, 195, 134, 163, 233, 161, 255, 240, 255,},
-  {73, 112, 196, 135, 164, 233, 162, 255, 241, 255,},
-  {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,},
-  {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,},
-  {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,},
-  {77, 116, 200, 137, 165, 236, 165, 255, 242, 255,},
-  {77, 117, 201, 137, 165, 236, 165, 255, 243, 255,},
-  {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,},
-  {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,},
-  {80, 120, 204, 139, 166, 238, 167, 255, 244, 255,},
-  {81, 121, 205, 139, 167, 238, 168, 255, 244, 255,},
-  {82, 122, 206, 140, 167, 239, 169, 255, 245, 255,},
-  {82, 123, 206, 141, 168, 239, 170, 255, 245, 255,},
-  {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,},
-  {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,},
-  {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,},
-  {86, 127, 210, 143, 169, 241, 173, 255, 246, 255,},
-  {87, 128, 211, 143, 169, 242, 173, 255, 247, 255,},
-  {87, 129, 212, 144, 170, 242, 174, 255, 247, 255,},
-  {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,},
-  {89, 131, 213, 145, 171, 243, 176, 255, 248, 255,},
-  {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,},
-  {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,},
-  {92, 134, 216, 147, 172, 244, 178, 255, 248, 255,},
-  {93, 135, 216, 147, 172, 244, 179, 255, 249, 255,},
-  {93, 136, 217, 148, 173, 245, 179, 255, 249, 255,},
-  {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,},
-  {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,},
-  {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,},
-  {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,},
-  {98, 141, 221, 151, 175, 246, 183, 255, 250, 255,},
-  {99, 142, 222, 151, 175, 247, 184, 255, 250, 255,},
-  {100, 143, 222, 152, 175, 247, 184, 255, 251, 255,},
-  {100, 144, 223, 153, 176, 247, 185, 255, 251, 255,},
-  {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,},
-  {102, 146, 224, 154, 177, 248, 187, 255, 251, 255,},
-  {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,},
-  {104, 148, 226, 155, 178, 248, 188, 255, 252, 255,},
-  {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,},
-  {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,},
-  {107, 151, 228, 157, 179, 249, 190, 255, 252, 255,},
-  {108, 152, 228, 158, 179, 249, 191, 255, 252, 255,},
-  {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,},
-  {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,},
-  {111, 155, 230, 159, 181, 250, 193, 255, 253, 255,},
-  {111, 156, 231, 160, 181, 250, 194, 255, 253, 255,},
-  {112, 157, 231, 161, 181, 251, 195, 255, 253, 255,},
-  {113, 158, 232, 161, 182, 251, 196, 255, 253, 255,},
-  {114, 159, 233, 162, 182, 251, 196, 255, 253, 255,},
-  {115, 160, 233, 163, 183, 251, 197, 255, 253, 255,},
-  {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,},
-  {117, 162, 234, 164, 184, 252, 199, 255, 254, 255,},
-  {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,},
-  {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,},
-  {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,},
-  {121, 166, 236, 167, 186, 252, 202, 255, 254, 255,},
-  {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,},
-  {123, 168, 237, 168, 187, 253, 203, 255, 254, 255,},
-  {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,},
-  {125, 170, 238, 169, 188, 253, 205, 255, 254, 255,},
-  {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,},
-  {127, 172, 239, 171, 189, 253, 206, 255, 254, 255,},
-  {128, 173, 240, 172, 189, 253, 207, 255, 255, 255,},
-  {129, 174, 240, 172, 190, 253, 208, 255, 255, 255,},
-  {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,},
-  {131, 176, 241, 174, 191, 254, 209, 255, 255, 255,},
-  {132, 177, 242, 175, 191, 254, 210, 255, 255, 255,},
-  {133, 178, 242, 175, 192, 254, 210, 255, 255, 255,},
-  {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,},
-  {135, 180, 243, 177, 193, 254, 212, 255, 255, 255,},
-  {137, 181, 243, 177, 193, 254, 213, 255, 255, 255,},
-  {138, 182, 244, 178, 194, 254, 213, 255, 255, 255,},
-  {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,},
-  {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,},
-  {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,},
-  {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,},
-  {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,},
-  {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,},
-  {145, 189, 246, 184, 197, 255, 218, 255, 255, 255,},
-  {146, 190, 247, 184, 198, 255, 219, 255, 255, 255,},
-  {147, 191, 247, 185, 199, 255, 220, 255, 255, 255,},
-  {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,},
-  {150, 193, 247, 187, 200, 255, 221, 255, 255, 255,},
-  {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,},
-  {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,},
-  {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,},
-  {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,},
-  {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,},
-  {157, 199, 249, 192, 203, 255, 225, 255, 255, 255,},
-  {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,},
-  {159, 201, 250, 193, 205, 255, 227, 255, 255, 255,},
-  {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,},
-  {162, 203, 250, 195, 206, 255, 228, 255, 255, 255,},
-  {163, 204, 251, 196, 206, 255, 229, 255, 255, 255,},
-  {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,},
-  {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,},
-  {166, 207, 251, 199, 208, 255, 231, 255, 255, 255,},
-  {168, 208, 251, 200, 209, 255, 231, 255, 255, 255,},
-  {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,},
-  {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,},
-  {172, 211, 252, 202, 211, 255, 233, 255, 255, 255,},
-  {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,},
-  {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,},
-  {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,},
-  {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,},
-  {178, 216, 253, 207, 215, 255, 237, 255, 255, 255,},
-  {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,},
-  {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,},
-  {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,},
-  {184, 220, 254, 211, 217, 255, 239, 255, 255, 255,},
-  {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,},
-  {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,},
-  {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,},
-  {189, 224, 254, 215, 221, 255, 241, 255, 255, 255,},
-  {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,},
-  {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,},
-  {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,},
-  {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,},
-  {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,},
-  {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,},
-  {200, 231, 255, 222, 226, 255, 245, 255, 255, 255,},
-  {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,},
-  {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,},
-  {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,},
-  {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,},
-  {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,},
-  {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,},
-  {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,},
-  {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,},
-  {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,},
-  {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,},
-  {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,},
-  {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,},
-  {223, 244, 255, 237, 239, 255, 252, 255, 255, 255,},
-  {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,},
-  {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,},
-  {229, 247, 255, 241, 242, 255, 253, 255, 255, 255,},
-  {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,},
-  {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,},
-  {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,},
-  {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,},
-  {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,},
-  {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,},
-  {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,},
-};
+void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
+  if (full != model)
+    vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  extend_model_to_full_distribution(model[PIVOT_NODE], full);
+}
 
-const vp9_prob vp9_modelcoefprobs_gg625p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {
-  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)
-  // source model with varying quantizer step size for a uniform quantizer
-  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use
-  {1,   1,   3,  87, 129,   6,  87,  20,  91,  24,},
-  {1,   2,   6,  88, 130,  11,  89,  36,  94,  41,},
-  {2,   3,   8,  88, 130,  15,  90,  50,  97,  56,},
-  {2,   4,  11,  89, 131,  20,  90,  62,  99,  70,},
-  {3,   5,  14,  90, 131,  24,  91,  74, 102,  81,},
-  {3,   6,  16,  90, 132,  29,  92,  84, 104,  92,},
-  {4,   7,  19,  91, 132,  33,  93,  93, 106, 101,},
-  {4,   8,  21,  91, 132,  37,  93, 102, 108, 110,},
-  {5,   9,  24,  92, 133,  40,  94, 110, 110, 118,},
-  {5,  10,  26,  92, 133,  44,  95, 118, 111, 125,},
-  {6,  11,  29,  93, 134,  48,  96, 125, 113, 132,},
-  {7,  12,  31,  93, 134,  51,  96, 132, 115, 139,},
-  {7,  13,  33,  93, 134,  55,  97, 138, 117, 145,},
-  {8,  14,  36,  94, 135,  58,  97, 144, 119, 150,},
-  {8,  15,  38,  94, 135,  62,  98, 149, 120, 155,},
-  {9,  16,  40,  95, 135,  65,  99, 154, 122, 160,},
-  {10,  17,  42,  95, 136,  68,  99, 159, 124, 165,},
-  {10,  18,  45,  96, 136,  71, 100, 164, 125, 169,},
-  {11,  19,  47,  96, 136,  74, 100, 168, 127, 174,},
-  {11,  20,  49,  96, 136,  77, 101, 173, 128, 177,},
-  {12,  21,  51,  97, 137,  80, 102, 176, 130, 181,},
-  {13,  22,  53,  97, 137,  83, 102, 180, 131, 185,},
-  {13,  23,  55,  98, 137,  86, 103, 184, 133, 188,},
-  {14,  24,  57,  98, 138,  89, 103, 187, 135, 191,},
-  {14,  25,  59,  98, 138,  91, 104, 190, 136, 194,},
-  {15,  26,  61,  99, 138,  94, 104, 193, 138, 197,},
-  {16,  27,  64,  99, 139,  97, 105, 196, 139, 200,},
-  {16,  28,  66, 100, 139,  99, 106, 199, 141, 202,},
-  {17,  29,  68, 100, 139, 102, 106, 201, 142, 205,},
-  {18,  30,  69, 100, 139, 104, 107, 204, 143, 207,},
-  {18,  31,  71, 101, 140, 107, 107, 206, 145, 209,},
-  {19,  32,  73, 101, 140, 109, 108, 209, 146, 211,},
-  {20,  33,  75, 102, 140, 112, 108, 211, 148, 213,},
-  {20,  34,  77, 102, 141, 114, 109, 213, 149, 215,},
-  {21,  35,  79, 102, 141, 116, 109, 215, 150, 217,},
-  {22,  36,  81, 103, 141, 119, 110, 217, 152, 219,},
-  {22,  37,  83, 103, 141, 121, 110, 218, 153, 220,},
-  {23,  38,  85, 103, 142, 123, 111, 220, 155, 222,},
-  {24,  39,  87, 104, 142, 125, 112, 222, 156, 224,},
-  {24,  40,  88, 104, 142, 127, 112, 223, 157, 225,},
-  {25,  41,  90, 105, 143, 129, 113, 225, 159, 226,},
-  {26,  42,  92, 105, 143, 131, 113, 226, 160, 228,},
-  {26,  43,  94, 105, 143, 133, 114, 227, 161, 229,},
-  {27,  44,  95, 106, 143, 135, 114, 229, 162, 230,},
-  {28,  45,  97, 106, 144, 137, 115, 230, 164, 231,},
-  {28,  46,  99, 107, 144, 139, 115, 231, 165, 232,},
-  {29,  47, 101, 107, 144, 141, 116, 232, 166, 233,},
-  {30,  48, 102, 107, 145, 143, 116, 233, 168, 234,},
-  {31,  49, 104, 108, 145, 145, 117, 234, 169, 235,},
-  {31,  50, 106, 108, 145, 147, 118, 235, 170, 236,},
-  {32,  51, 107, 108, 145, 149, 118, 236, 171, 237,},
-  {33,  52, 109, 109, 146, 150, 119, 237, 172, 238,},
-  {33,  53, 111, 109, 146, 152, 119, 238, 174, 239,},
-  {34,  54, 112, 110, 146, 154, 120, 239, 175, 240,},
-  {35,  55, 114, 110, 146, 156, 120, 240, 176, 240,},
-  {36,  56, 115, 110, 147, 157, 121, 240, 177, 241,},
-  {36,  57, 117, 111, 147, 159, 121, 241, 178, 242,},
-  {37,  58, 119, 111, 147, 161, 122, 242, 180, 242,},
-  {38,  59, 120, 112, 148, 162, 122, 242, 181, 243,},
-  {38,  60, 122, 112, 148, 164, 123, 243, 182, 244,},
-  {39,  61, 123, 112, 148, 165, 124, 244, 183, 244,},
-  {40,  62, 125, 113, 148, 167, 124, 244, 184, 245,},
-  {41,  63, 126, 113, 149, 168, 125, 245, 185, 245,},
-  {41,  64, 128, 114, 149, 170, 125, 245, 186, 246,},
-  {42,  65, 129, 114, 149, 171, 126, 246, 187, 246,},
-  {43,  66, 131, 114, 150, 173, 126, 246, 188, 247,},
-  {44,  67, 132, 115, 150, 174, 127, 247, 189, 247,},
-  {44,  68, 134, 115, 150, 176, 127, 247, 191, 247,},
-  {45,  69, 135, 116, 150, 177, 128, 248, 192, 248,},
-  {46,  70, 136, 116, 151, 178, 129, 248, 193, 248,},
-  {47,  71, 138, 116, 151, 180, 129, 248, 194, 249,},
-  {48,  72, 139, 117, 151, 181, 130, 249, 195, 249,},
-  {48,  73, 141, 117, 152, 183, 130, 249, 196, 249,},
-  {49,  74, 142, 118, 152, 184, 131, 249, 197, 250,},
-  {50,  75, 143, 118, 152, 185, 131, 250, 198, 250,},
-  {51,  76, 145, 118, 152, 186, 132, 250, 199, 250,},
-  {51,  77, 146, 119, 153, 188, 132, 250, 200, 250,},
-  {52,  78, 148, 119, 153, 189, 133, 251, 201, 251,},
-  {53,  79, 149, 120, 153, 190, 134, 251, 201, 251,},
-  {54,  80, 150, 120, 154, 191, 134, 251, 202, 251,},
-  {55,  81, 151, 120, 154, 192, 135, 251, 203, 251,},
-  {55,  82, 153, 121, 154, 194, 135, 252, 204, 252,},
-  {56,  83, 154, 121, 155, 195, 136, 252, 205, 252,},
-  {57,  84, 155, 122, 155, 196, 136, 252, 206, 252,},
-  {58,  85, 157, 122, 155, 197, 137, 252, 207, 252,},
-  {59,  86, 158, 123, 155, 198, 138, 252, 208, 252,},
-  {59,  87, 159, 123, 156, 199, 138, 253, 209, 253,},
-  {60,  88, 160, 123, 156, 200, 139, 253, 210, 253,},
-  {61,  89, 162, 124, 156, 201, 139, 253, 210, 253,},
-  {62,  90, 163, 124, 157, 202, 140, 253, 211, 253,},
-  {63,  91, 164, 125, 157, 203, 140, 253, 212, 253,},
-  {64,  92, 165, 125, 157, 204, 141, 253, 213, 253,},
-  {64,  93, 166, 126, 158, 205, 142, 254, 214, 253,},
-  {65,  94, 168, 126, 158, 206, 142, 254, 214, 254,},
-  {66,  95, 169, 126, 158, 207, 143, 254, 215, 254,},
-  {67,  96, 170, 127, 158, 208, 143, 254, 216, 254,},
-  {68,  97, 171, 127, 159, 209, 144, 254, 217, 254,},
-  {69,  98, 172, 128, 159, 210, 145, 254, 218, 254,},
-  {69,  99, 173, 128, 159, 211, 145, 254, 218, 254,},
-  {70, 100, 175, 129, 160, 212, 146, 254, 219, 254,},
-  {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,},
-  {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,},
-  {73, 103, 178, 130, 161, 214, 148, 255, 221, 255,},
-  {74, 104, 179, 130, 161, 215, 148, 255, 222, 255,},
-  {75, 105, 180, 131, 161, 216, 149, 255, 223, 255,},
-  {75, 106, 181, 131, 162, 217, 149, 255, 223, 255,},
-  {76, 107, 182, 132, 162, 218, 150, 255, 224, 255,},
-  {77, 108, 183, 132, 162, 219, 151, 255, 225, 255,},
-  {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,},
-  {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,},
-  {80, 111, 186, 134, 163, 221, 152, 255, 226, 255,},
-  {81, 112, 187, 134, 164, 222, 153, 255, 227, 255,},
-  {82, 113, 188, 135, 164, 222, 154, 255, 228, 255,},
-  {83, 114, 189, 135, 164, 223, 154, 255, 228, 255,},
-  {83, 115, 190, 136, 165, 224, 155, 255, 229, 255,},
-  {84, 116, 191, 136, 165, 224, 156, 255, 230, 255,},
-  {85, 117, 192, 137, 165, 225, 156, 255, 230, 255,},
-  {86, 118, 193, 137, 166, 226, 157, 255, 231, 255,},
-  {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,},
-  {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,},
-  {89, 121, 196, 138, 167, 228, 159, 255, 232, 255,},
-  {90, 122, 197, 139, 167, 228, 159, 255, 233, 255,},
-  {91, 123, 198, 139, 167, 229, 160, 255, 233, 255,},
-  {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,},
-  {93, 125, 200, 140, 168, 230, 161, 255, 234, 255,},
-  {93, 126, 201, 141, 168, 231, 162, 255, 235, 255,},
-  {94, 127, 202, 141, 169, 231, 163, 255, 235, 255,},
-  {95, 128, 203, 142, 169, 232, 163, 255, 236, 255,},
-  {96, 129, 203, 142, 169, 233, 164, 255, 236, 255,},
-  {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,},
-  {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,},
-  {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,},
-  {100, 133, 207, 145, 171, 235, 166, 255, 238, 255,},
-  {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,},
-  {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,},
-  {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,},
-  {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,},
-  {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,},
-  {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,},
-  {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,},
-  {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,},
-  {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,},
-  {110, 143, 215, 150, 174, 240, 173, 255, 242, 255,},
-  {111, 144, 216, 150, 175, 240, 174, 255, 243, 255,},
-  {112, 145, 216, 151, 175, 240, 174, 255, 243, 255,},
-  {113, 146, 217, 152, 176, 241, 175, 255, 243, 255,},
-  {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,},
-  {115, 148, 219, 153, 176, 242, 176, 255, 244, 255,},
-  {116, 149, 219, 153, 177, 242, 177, 255, 244, 255,},
-  {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,},
-  {118, 151, 221, 154, 178, 243, 178, 255, 245, 255,},
-  {119, 152, 221, 155, 178, 243, 179, 255, 245, 255,},
-  {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,},
-  {121, 154, 223, 156, 179, 244, 180, 255, 246, 255,},
-  {122, 155, 223, 157, 179, 244, 181, 255, 246, 255,},
-  {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,},
-  {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,},
-  {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,},
-  {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,},
-  {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,},
-  {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,},
-  {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,},
-  {130, 163, 229, 162, 183, 247, 187, 255, 248, 255,},
-  {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,},
-  {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,},
-  {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,},
-  {135, 167, 231, 164, 184, 248, 190, 255, 249, 255,},
-  {136, 168, 232, 165, 185, 248, 190, 255, 250, 255,},
-  {137, 169, 232, 165, 185, 248, 191, 255, 250, 255,},
-  {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,},
-  {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,},
-  {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,},
-  {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,},
-  {142, 174, 235, 169, 187, 250, 195, 255, 251, 255,},
-  {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,},
-  {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,},
-  {146, 177, 236, 171, 189, 250, 197, 255, 251, 255,},
-  {147, 178, 237, 171, 189, 251, 197, 255, 252, 255,},
-  {148, 179, 237, 172, 190, 251, 198, 255, 252, 255,},
-  {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,},
-  {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,},
-  {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,},
-  {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,},
-  {153, 184, 240, 176, 192, 252, 202, 255, 253, 255,},
-  {155, 185, 240, 176, 193, 252, 203, 255, 253, 255,},
-  {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,},
-  {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,},
-  {158, 188, 242, 179, 194, 252, 205, 255, 253, 255,},
-  {159, 189, 242, 179, 195, 252, 206, 255, 253, 255,},
-  {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,},
-  {162, 191, 243, 181, 196, 253, 207, 255, 253, 255,},
-  {163, 192, 243, 182, 196, 253, 208, 255, 254, 255,},
-  {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,},
-  {165, 194, 244, 183, 198, 253, 209, 255, 254, 255,},
-  {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,},
-  {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,},
-  {169, 197, 245, 185, 199, 254, 212, 255, 254, 255,},
-  {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,},
-  {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,},
-  {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,},
-  {174, 201, 247, 189, 201, 254, 215, 255, 254, 255,},
-  {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,},
-  {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,},
-  {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,},
-  {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,},
-  {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,},
-  {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,},
-  {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,},
-  {184, 209, 249, 195, 206, 255, 221, 255, 255, 255,},
-  {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,},
-  {186, 211, 250, 197, 208, 255, 222, 255, 255, 255,},
-  {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,},
-  {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,},
-  {190, 214, 251, 200, 210, 255, 224, 255, 255, 255,},
-  {192, 215, 251, 201, 210, 255, 225, 255, 255, 255,},
-  {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,},
-  {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,},
-  {196, 218, 252, 204, 212, 255, 228, 255, 255, 255,},
-  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},
-  {198, 220, 252, 206, 214, 255, 229, 255, 255, 255,},
-  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},
-  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},
-  {202, 223, 253, 209, 216, 255, 231, 255, 255, 255,},
-  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},
-  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},
-  {207, 226, 253, 212, 218, 255, 234, 255, 255, 255,},
-  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},
-  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},
-  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},
-  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},
-  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},
-  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},
-  {217, 233, 254, 219, 224, 255, 239, 255, 255, 255,},
-  {218, 234, 255, 221, 225, 255, 240, 255, 255, 255,},
-  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},
-  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},
-  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},
-  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},
-  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},
-  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},
-  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},
-  {231, 242, 255, 231, 233, 255, 246, 255, 255, 255,},
-  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},
-  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},
-  {235, 245, 255, 235, 237, 255, 248, 255, 255, 255,},
-  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},
-  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},
-  {241, 248, 255, 239, 241, 255, 250, 255, 255, 255,},
-  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},
-  {244, 250, 255, 243, 243, 255, 252, 255, 255, 255,},
-  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},
-  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},
-  {250, 253, 255, 248, 248, 255, 254, 255, 255, 255,},
-  {252, 254, 255, 250, 250, 255, 255, 255, 255, 255,},
-  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},
-};
-
-void vp9_get_model_distribution(vp9_prob p, vp9_prob *tree_probs,
-                                int b, int r) {
-  const vp9_prob (*model)[ENTROPY_NODES - 1];
-#if UNCONSTRAINED_NODES == 2
-  if (r != INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75;
-  else if (r == INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75;
-  else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC)
-    model = vp9_modelcoefprobs_gg75;
-  else
-    model = vp9_modelcoefprobs_gg75;
-#else
-  if (r != INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75p1;
-  else if (r == INTRA_FRAME && b == PLANE_TYPE_UV)
-    model = vp9_modelcoefprobs_gg75p1;
-  else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC)
-    model = vp9_modelcoefprobs_gg75p1;
-  else
-    model = vp9_modelcoefprobs_gg75p1;
-#endif
-  vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
-             model[p] + UNCONSTRAINED_NODES - 1,
-             (ENTROPY_NODES - UNCONSTRAINED_NODES) * sizeof(vp9_prob));
+void vp9_model_to_full_probs_sb(
+    vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
+    vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+  int c, p;
+  for (c = 0; c < COEF_BANDS; ++c)
+    for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
+      vp9_model_to_full_probs(model[c][p], full[c][p]);
+    }
 }
-#endif
 
 static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
 
@@ -2077,7 +439,7 @@
   init_bit_tree(cat6, 14);
 }
 
-vp9_extra_bit_struct vp9_extra_bits[12] = {
+vp9_extra_bit vp9_extra_bits[12] = {
   { 0, 0, 0, 0},
   { 0, 0, 0, 1},
   { 0, 0, 0, 2},
@@ -2111,177 +473,32 @@
     int ctx;
     assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);
     if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {
-      ctx = (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
-             token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+      ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] +
+             token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1;
     } else {
-      ctx = token_cache[neighbors[MAX_NEIGHBORS * c + 0]];
+      ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]];
     }
-    return vp9_pt_energy_class[ctx];
+    return ctx;
   }
 };
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
-#if CONFIG_MODELCOEFPROB
-  int b, r, c, p;
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_DEFAULT_COUNTS
-  int h, g;
-  for (h = 0; h < MAX_NZC_CONTEXTS; ++h) {
-    for (g = 0; g < REF_TYPES; ++g) {
-      int i;
-      unsigned int branch_ct4x4[NZC4X4_NODES][2];
-      unsigned int branch_ct8x8[NZC8X8_NODES][2];
-      unsigned int branch_ct16x16[NZC16X16_NODES][2];
-      unsigned int branch_ct32x32[NZC32X32_NODES][2];
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc4x4_tree,
-          pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4,
-          default_nzc_counts_4x4[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc8x8_tree,
-          pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8,
-          default_nzc_counts_8x8[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc16x16_tree,
-          pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16,
-          default_nzc_counts_16x16[h][g][i], 0);
-      }
-      for (i = 0; i < BLOCK_TYPES; ++i) {
-        vp9_tree_probs_from_distribution(
-          vp9_nzc32x32_tree,
-          pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32,
-          default_nzc_counts_32x32[h][g][i], 0);
-      }
-    }
-  }
-#else
-  vpx_memcpy(pc->fc.nzc_probs_4x4, default_nzc_probs_4x4,
-             sizeof(pc->fc.nzc_probs_4x4));
-  vpx_memcpy(pc->fc.nzc_probs_8x8, default_nzc_probs_8x8,
-             sizeof(pc->fc.nzc_probs_8x8));
-  vpx_memcpy(pc->fc.nzc_probs_16x16, default_nzc_probs_16x16,
-             sizeof(pc->fc.nzc_probs_16x16));
-  vpx_memcpy(pc->fc.nzc_probs_32x32, default_nzc_probs_32x32,
-             sizeof(pc->fc.nzc_probs_32x32));
-#endif
-  vpx_memcpy(pc->fc.nzc_pcat_probs, default_nzc_pcat_probs,
-             sizeof(pc->fc.nzc_pcat_probs));
-#endif  // CONFIG_CODE_NONZEROCOUNT
-#if CONFIG_MODELCOEFPROB
-  for (b = 0; b < BLOCK_TYPES; ++b)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (c = 0; c < COEF_BANDS; ++c)
-        for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
-          int t;
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_4x4[b][r][c][p][t] =
-                default_coef_probs_4x4[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_4x4[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_4x4[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_8x8[b][r][c][p][t] =
-                default_coef_probs_8x8[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_8x8[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_8x8[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_16x16[b][r][c][p][t] =
-                default_coef_probs_16x16[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_16x16[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_16x16[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++)
-            pc->fc.coef_probs_32x32[b][r][c][p][t] =
-                default_coef_probs_32x32[b][r][c][p][t];
-          vp9_get_model_distribution(
-              default_coef_probs_32x32[b][r][c][p][UNCONSTRAINED_NODES - 1],
-              pc->fc.coef_probs_32x32[b][r][c][p], b, r);
-        }
-#else
-  vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
-             sizeof(pc->fc.coef_probs_4x4));
-  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
-             sizeof(pc->fc.coef_probs_8x8));
-  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,
-             sizeof(pc->fc.coef_probs_16x16));
-  vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
-             sizeof(pc->fc.coef_probs_32x32));
-#endif
+  vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
+             sizeof(pc->fc.coef_probs[TX_4X4]));
+  vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8,
+             sizeof(pc->fc.coef_probs[TX_8X8]));
+  vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16,
+             sizeof(pc->fc.coef_probs[TX_16X16]));
+  vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32,
+             sizeof(pc->fc.coef_probs[TX_32X32]));
 }
 
-#if CONFIG_MODELCOEFPROB
-// This is a placeholder function that will enable the default coef probs to
-// change for key frames based on the base_qindex. If base_qindex is large,
-// we can expect probabilities of zeros to be bigger, and vice versa. The rest
-// of the probabilities are derived from the nodel.
-void vp9_adjust_default_coef_probs(VP9_COMMON *cm) {
-  static const int factor_bits = 4;
-  static const int factor_rnd = 8;   // (1 << (factor_bits - 1))
-  int b, r, c, p;
-  int factor = (1 << factor_bits);
-  /*
-  if (cm->base_qindex < 32)
-    factor -= ((32 - cm->base_qindex) >> 4);
-    */
-  if (cm->base_qindex > 128)
-    factor += ((cm->base_qindex - 128) >> 4);
-  // printf(" Q %d factor %d\n", cm->base_qindex, factor);
-
-  for (b = 0; b < BLOCK_TYPES; ++b)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (c = 0; c < COEF_BANDS; ++c)
-        for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {
-          int t, x;
-          vp9_prob prob;
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_4x4[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_4x4[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_4x4[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_8x8[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_8x8[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_8x8[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_16x16[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_16x16[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_16x16[b][r][c][p], b, r);
-          for (t = 0; t < UNCONSTRAINED_NODES; t++) {
-            x = (default_coef_probs_32x32[b][r][c][p][t] * factor + factor_rnd)
-                >> factor_bits;
-            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));
-            cm->fc.coef_probs_32x32[b][r][c][p][t] = prob;
-          }
-          vp9_get_model_distribution(
-              prob, cm->fc.coef_probs_32x32[b][r][c][p], b, r);
-        }
-}
-#endif
-
 // Neighborhood 5-tuples for various scans and blocksizes,
 // in {top, left, topleft, topright, bottomleft} order
 // for each position in raster scan order.
 // -1 indicates the neighbor does not exist.
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_4x4_neighbors[16 * MAX_NEIGHBORS]);
+                vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
                 vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
@@ -2291,15 +508,15 @@
 DECLARE_ALIGNED(16, int,
                 vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]);
+                vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
                 vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
                 vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]);
+                vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);
 DECLARE_ALIGNED(16, int,
-                vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]);
+                vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]);
 
 static int find_in_scan(const int *scan, int l, int idx) {
   int n, l2 = l * l;
@@ -2361,32 +578,32 @@
 }
 
 void vp9_init_neighbors() {
-  init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4,
-                      vp9_default_zig_zag1d_4x4_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_4x4, 4,
+                      vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_row_scan_4x4, 4,
                       vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_col_scan_4x4, 4,
                       vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);
-  init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8,
-                      vp9_default_zig_zag1d_8x8_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_8x8, 8,
+                      vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_row_scan_8x8, 8,
                       vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_col_scan_8x8, 8,
                       vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);
-  init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16,
-                      vp9_default_zig_zag1d_16x16_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_16x16, 16,
+                      vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_row_scan_16x16, 16,
                       vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);
   init_scan_neighbors(vp9_col_scan_16x16, 16,
                       vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);
-  init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32,
-                      vp9_default_zig_zag1d_32x32_neighbors, MAX_NEIGHBORS);
+  init_scan_neighbors(vp9_default_scan_32x32, 32,
+                      vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS);
 }
 
 const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) {
-  if (scan == vp9_default_zig_zag1d_4x4) {
+  if (scan == vp9_default_scan_4x4) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_4x4_neighbors;
+    return vp9_default_scan_4x4_neighbors;
   } else if (scan == vp9_row_scan_4x4) {
     *pad = MAX_NEIGHBORS;
     return vp9_row_scan_4x4_neighbors;
@@ -2393,9 +610,9 @@
   } else if (scan == vp9_col_scan_4x4) {
     *pad = MAX_NEIGHBORS;
     return vp9_col_scan_4x4_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_8x8) {
+  } else if (scan == vp9_default_scan_8x8) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_8x8_neighbors;
+    return vp9_default_scan_8x8_neighbors;
   } else if (scan == vp9_row_scan_8x8) {
     *pad = 2;
     return vp9_row_scan_8x8_neighbors;
@@ -2402,9 +619,9 @@
   } else if (scan == vp9_col_scan_8x8) {
     *pad = 2;
     return vp9_col_scan_8x8_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_16x16) {
+  } else if (scan == vp9_default_scan_16x16) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_16x16_neighbors;
+    return vp9_default_scan_16x16_neighbors;
   } else if (scan == vp9_row_scan_16x16) {
     *pad = 2;
     return vp9_row_scan_16x16_neighbors;
@@ -2411,9 +628,9 @@
   } else if (scan == vp9_col_scan_16x16) {
     *pad = 2;
     return vp9_col_scan_16x16_neighbors;
-  } else if (scan == vp9_default_zig_zag1d_32x32) {
+  } else if (scan == vp9_default_scan_32x32) {
     *pad = MAX_NEIGHBORS;
-    return vp9_default_zig_zag1d_32x32_neighbors;
+    return vp9_default_scan_32x32_neighbors;
   } else {
     assert(0);
     return NULL;
@@ -2424,1098 +641,8 @@
   vp9_init_neighbors();
   init_bit_trees();
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree);
-  vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree);
-  vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree);
-  vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree);
-#endif
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-
-#define mb_in_cur_tile(cm, mb_row, mb_col)      \
-    ((mb_col) >= (cm)->cur_tile_mb_col_start && \
-     (mb_col) <= (cm)->cur_tile_mb_col_end   && \
-     (mb_row) >= 0)
-
-#define choose_nzc_context(nzc_exp, t2, t1)     \
-    ((nzc_exp) >= (t2) ? 2 : (nzc_exp) >= (t1) ? 1 : 0)
-
-#define NZC_T2_32X32    (16 << 6)
-#define NZC_T1_32X32     (4 << 6)
-
-#define NZC_T2_16X16    (12 << 6)
-#define NZC_T1_16X16     (3 << 6)
-
-#define NZC_T2_8X8       (8 << 6)
-#define NZC_T1_8X8       (2 << 6)
-
-#define NZC_T2_4X4       (4 << 6)
-#define NZC_T1_4X4       (1 << 6)
-
-// Transforms a mb16 block index to a sb64 block index
-static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) {
-  int r = (mb_row & 3);
-  int c = (mb_col & 3);
-  int b;
-  if (block < 16) {  // Y
-    int ib = block >> 2;
-    int jb = block & 3;
-    ib += r * 4;
-    jb += c * 4;
-    b = ib * 16 + jb;
-    assert(b < 256);
-    return b;
-  } else {  // UV
-    int base = block - (block & 3);
-    int ib = (block - base) >> 1;
-    int jb = (block - base) & 1;
-    ib += r * 2;
-    jb += c * 2;
-    b = base * 16 + ib * 8 + jb;
-    assert(b >= 256 && b < 384);
-    return b;
-  }
-}
-
-// Transforms a mb16 block index to a sb32 block index
-static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) {
-  int r = (mb_row & 1);
-  int c = (mb_col & 1);
-  int b;
-  if (block < 16) {  // Y
-    int ib = block >> 2;
-    int jb = block & 3;
-    ib += r * 4;
-    jb += c * 4;
-    b = ib * 8 + jb;
-    assert(b < 64);
-    return b;
-  } else {  // UV
-    int base = block - (block & 3);
-    int ib = (block - base) >> 1;
-    int jb = (block - base) & 1;
-    ib += r * 2;
-    jb += c * 2;
-    b = base * 4 + ib * 4 + jb;
-    assert(b >= 64 && b < 96);
-    return b;
-  }
-}
-
-static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) {
-  // s is the log of the number of 4x4 blocks in each row/col of larger block
-  int b, ib, jb, nb;
-  ib = block >> s;
-  jb = block - (ib << s);
-  ib >>= tx_size;
-  jb >>= tx_size;
-  nb = 1 << (s - tx_size);
-  b = (ib * nb + jb) << (2 * tx_size);
-  return b;
-}
-
-/* BEGIN - Helper functions to get the y nzcs */
-static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 256);
-  b = block_to_txfm_index(block, mi->txfm_size, 4);
-  assert(b < 256);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-
-static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 64);
-  b = block_to_txfm_index(block, mi->txfm_size, 3);
-  assert(b < 64);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-
-static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) {
-  int b;
-  assert(block < 16);
-  b = block_to_txfm_index(block, mi->txfm_size, 2);
-  assert(b < 16);
-  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);
-}
-/* END - Helper functions to get the y nzcs */
-
-/* Function to get y nzc where block index is in mb16 terms */
-static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m,
-                                  int mb_row, int mb_col, int block) {
-  // NOTE: All values returned are at 64 times the true value at 4x4 scale
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const int mis = cm->mode_info_stride;
-  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-    return 0;
-  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
-    int r = mb_row & 3;
-    int c = mb_col & 3;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_y_sb64(
-          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
-  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
-    int r = mb_row & 1;
-    int c = mb_col & 1;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_y_sb32(
-          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
-  } else {
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-      return 0;
-    return get_nzc_4x4_y_mb16(mi, block);
-  }
-}
-
-/* BEGIN - Helper functions to get the uv nzcs */
-static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 256 && block < 384);
-  uvtxfm_size = mi->txfm_size;
-  base = 256 + (block & 64);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 3);
-  assert(b >= 256 && b < 384);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-
-static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 64 && block < 96);
-  if (mi->txfm_size == TX_32X32)
-    uvtxfm_size = TX_16X16;
-  else
-    uvtxfm_size = mi->txfm_size;
-  base = 64 + (block & 16);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 2);
-  assert(b >= 64 && b < 96);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-
-static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) {
-  int b;
-  int base, uvtxfm_size;
-  assert(block >= 16 && block < 24);
-  if (mi->txfm_size == TX_8X8 &&
-      (mi->mode == SPLITMV || mi->mode == I8X8_PRED))
-    uvtxfm_size = TX_4X4;
-  else if (mi->txfm_size == TX_16X16)
-    uvtxfm_size = TX_8X8;
-  else
-    uvtxfm_size = mi->txfm_size;
-  base = 16 + (block & 4);
-  block -= base;
-  b = base + block_to_txfm_index(block, uvtxfm_size, 1);
-  assert(b >= 16 && b < 24);
-  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);
-}
-/* END - Helper functions to get the uv nzcs */
-
-/* Function to get uv nzc where block index is in mb16 terms */
-static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m,
-                                   int mb_row, int mb_col, int block) {
-  // NOTE: All values returned are at 64 times the true value at 4x4 scale
-  MB_MODE_INFO *const mi = &m->mbmi;
-  const int mis = cm->mode_info_stride;
-  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))
-    return 0;
-  if (mi->sb_type == BLOCK_SIZE_SB64X64) {
-    int r = mb_row & 3;
-    int c = mb_col & 3;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-      return get_nzc_4x4_uv_sb64(
-          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));
-  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {
-    int r = mb_row & 1;
-    int c = mb_col & 1;
-    m -= c + r * mis;
-    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))
-      return 0;
-    else
-    return get_nzc_4x4_uv_sb32(
-        &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));
-  } else {
-    return get_nzc_4x4_uv_mb16(mi, block);
-  }
-}
-
-int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 256);
-  switch (txfm_size) {
-    case TX_32X32:
-      assert((block & 63) == 0);
-      if (block < 128) {
-        int o = (block >> 6) * 2;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o + 1,
-                          mb_row - 1, mb_col + o + 1, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 128] << 3;
-      }
-      if ((block & 127) == 0) {
-        int o = (block >> 7) * 2;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,
-                          mb_row + o + 1, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 64] << 3;
-      }
-      nzc_exp <<= 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-      break;
-
-    case TX_16X16:
-      assert((block & 15) == 0);
-      if (block < 64) {
-        int o = block >> 4;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 64] << 4;
-      }
-      if ((block & 63) == 0) {
-        int o = block >> 6;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 32) {
-        int o = block >> 3;
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 5;
-      }
-      if ((block & 31) == 0) {
-        int o = block >> 6;
-        int p = ((block >> 5) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-      break;
-
-    case TX_4X4:
-      if (block < 16) {
-        int o = block >> 2;
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 16] << 6);
-      }
-      if ((block & 15) == 0) {
-        int o = block >> 6;
-        int p = (block >> 4) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-      break;
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 64);
-  switch (txfm_size) {
-    case TX_32X32:
-      assert(block == 0);
-      nzc_exp =
-          (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
-           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) +
-           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
-           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) +
-           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-      break;
-
-    case TX_16X16:
-      assert((block & 15) == 0);
-      if (block < 32) {
-        int o = (block >> 4) & 1;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
-      }
-      if ((block & 31) == 0) {
-        int o = block >> 5;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 16) {
-        int o = block >> 3;
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +
-            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
-      }
-      if ((block & 15) == 0) {
-        int o = block >> 5;
-        int p = ((block >> 4) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-      break;
-
-    case TX_4X4:
-      if (block < 8) {
-        int o = block >> 2;
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
-      }
-      if ((block & 7) == 0) {
-        int o = block >> 5;
-        int p = (block >> 3) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-      break;
-
-    default:
-      return 0;
-      break;
-  }
-}
-
-int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  assert(block < 16);
-  switch (txfm_size) {
-    case TX_16X16:
-      assert(block == 0);
-      nzc_exp =
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +
-          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +
-          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15);
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (block < 8) {
-        int p = ((block >> 2) & 1) ? 14 : 12;
-        nzc_exp =
-            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) +
-            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
-      }
-      if ((block & 7) == 0) {
-        int p = ((block >> 3) & 1) ? 11 : 3;
-        nzc_exp +=
-            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) +
-            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (block < 4) {
-        int p = block & 3;
-        nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col,
-                                12 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
-      }
-      if ((block & 3) == 0) {
-        int p = (block >> 2) & 3;
-        nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1,
-                                 3 + 4 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-      break;
-  }
-}
-
-int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 63);
-  const int boff = (block & 63);
-  const int base_mb16 = base >> 4;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 256 && block < 384);
-  txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_32X32:
-      assert(block == 256 || block == 320);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,
-                         base_mb16 + 3);
-      nzc_exp <<= 2;
-      // Note nzc_exp is 64 times the average value expected at 32x32 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);
-
-    case TX_16X16:
-      // uv txfm_size 16x16
-      assert((block & 15) == 0);
-      if (boff < 32) {
-        int o = (boff >> 4) & 1;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3) +
-            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;
-      }
-      if ((boff & 31) == 0) {
-        int o = boff >> 5;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
-                           mb_row + o, mb_col - 1, base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis,
-                           mb_row + o, mb_col - 1, base_mb16 + 3) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
-                           mb_row + o + 1, mb_col - 1, base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,
-                           mb_row + o + 1, mb_col - 1, base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;
-      }
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (boff < 16) {
-        int o = boff >> 2;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;
-      }
-      if ((boff & 15) == 0) {
-        int o = boff >> 4;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 8) {
-        int o = boff >> 1;
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);
-      }
-      if ((boff & 7) == 0) {
-        int o = boff >> 4;
-        int p = (boff >> 3) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 15);
-  const int boff = (block & 15);
-  const int base_mb16 = base >> 2;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 64 && block < 96);
-  if (txfm_size == TX_32X32)
-    txfm_size_uv = TX_16X16;
-  else
-    txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_16X16:
-      // uv txfm_size 16x16
-      assert(block == 64 || block == 80);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,
-                         base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,
-                         base_mb16 + 3);
-      nzc_exp <<= 1;
-      // Note nzc_exp is 64 times the average value expected at 16x16 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);
-      break;
-
-    case TX_8X8:
-      assert((block & 3) == 0);
-      if (boff < 8) {
-        int o = boff >> 2;
-        nzc_exp =
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 2) +
-            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;
-      }
-      if ((boff & 7) == 0) {
-        int o = boff >> 3;
-        nzc_exp +=
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 1) +
-            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                           base_mb16 + 3);
-      } else {
-        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;
-      }
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 4) {
-        int o = boff >> 1;
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);
-      }
-      if ((boff & 3) == 0) {
-        int o = boff >> 3;
-        int p = (boff >> 2) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block) {
-  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy
-  // neighboring blocks are
-  int mis = cm->mode_info_stride;
-  int nzc_exp = 0;
-  const int base = block - (block & 3);
-  const int boff = (block & 3);
-  const int base_mb16 = base;
-  TX_SIZE txfm_size = cur->mbmi.txfm_size;
-  TX_SIZE txfm_size_uv;
-
-  assert(block >= 16 && block < 24);
-  if (txfm_size == TX_16X16)
-    txfm_size_uv = TX_8X8;
-  else if (txfm_size == TX_8X8 &&
-           (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV))
-    txfm_size_uv = TX_4X4;
-  else
-    txfm_size_uv = txfm_size;
-
-  switch (txfm_size_uv) {
-    case TX_8X8:
-      assert((block & 3) == 0);
-      nzc_exp =
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) +
-          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) +
-          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3);
-      // Note nzc_exp is 64 times the average value expected at 8x8 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);
-
-    case TX_4X4:
-      if (boff < 2) {
-        int p = boff & 1;
-        nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,
-                                 base_mb16 + 2 + p);
-      } else {
-        nzc_exp = (cur->mbmi.nzcs[block - 2] << 6);
-      }
-      if ((boff & 1) == 0) {
-        int p = (boff >> 1) & 1;
-        nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,
-                                  base_mb16 + 1 + 2 * p);
-      } else {
-        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);
-      }
-      nzc_exp >>= 1;
-      // Note nzc_exp is 64 times the average value expected at 4x4 scale
-      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);
-
-    default:
-      return 0;
-  }
-}
-
-int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) {
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    assert(block < 384);
-    if (block < 256)
-      return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    assert(block < 96);
-    if (block < 64)
-      return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  } else {
-    assert(block < 64);
-    if (block < 16)
-      return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context,
-                                        get_mb_row(xd), get_mb_col(xd), block);
-    else
-      return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context,
-                                         get_mb_row(xd), get_mb_col(xd), block);
-  }
-}
-
-static void update_nzc(VP9_COMMON *cm,
-                       uint16_t nzc,
-                       int nzc_context,
-                       TX_SIZE tx_size,
-                       int ref,
-                       int type) {
-  int e, c;
-  c = codenzc(nzc);
-  if (tx_size == TX_32X32)
-    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_16X16)
-    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_8X8)
-    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  else if (tx_size == TX_4X4)
-    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  else
-    assert(0);
-
-  if ((e = vp9_extranzcbits[c])) {
-    int x = nzc - vp9_basenzcvalue[c];
-    while (e--) {
-      int b = (x >> e) & 1;
-      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-  }
-}
-
-static void update_nzcs_sb64(VP9_COMMON *cm,
-                             MACROBLOCKD *xd,
-                             int mb_row,
-                             int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void update_nzcs_sb32(VP9_COMMON *cm,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void update_nzcs_mb16(VP9_COMMON *cm,
-                             MACROBLOCKD *xd,
-                             int mb_row,
-                             int mb_col) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-void vp9_update_nzc_counts(VP9_COMMON *cm,
-                           MACROBLOCKD *xd,
-                           int mb_row,
-                           int mb_col) {
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64)
-    update_nzcs_sb64(cm, xd, mb_row, mb_col);
-  else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32)
-    update_nzcs_sb32(cm, xd, mb_row, mb_col);
-  else
-    update_nzcs_mb16(cm, xd, mb_row, mb_col);
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 // #define COEF_COUNT_TESTING
 
 #define COEF_COUNT_SAT 24
@@ -3525,34 +652,61 @@
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-static void adapt_coef_probs(vp9_coeff_probs *dst_coef_probs,
-                             vp9_coeff_probs *pre_coef_probs,
-                             int block_types, vp9_coeff_count *coef_counts,
-                             unsigned int (*eob_branch_count)[REF_TYPES]
-                                                             [COEF_BANDS]
-                                                      [PREV_COEF_CONTEXTS],
+void vp9_full_to_model_count(unsigned int *model_count,
+                             unsigned int *full_count) {
+  int n;
+  model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+  model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+  model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+  for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
+    model_count[TWO_TOKEN] += full_count[n];
+  model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
+}
+
+void vp9_full_to_model_counts(
+    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
+  int i, j, k, l;
+  for (i = 0; i < BLOCK_TYPES; ++i)
+    for (j = 0; j < REF_TYPES; ++j)
+      for (k = 0; k < COEF_BANDS; ++k)
+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+          if (l >= 3 && k == 0)
+            continue;
+          vp9_full_to_model_count(model_count[i][j][k][l],
+                                  full_count[i][j][k][l]);
+        }
+}
+
+static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
                              int count_sat, int update_factor) {
+  vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
+  vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size];
+  vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size];
+  unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+      cm->fc.eob_branch_counts[txfm_size];
   int t, i, j, k, l, count;
-  unsigned int branch_ct[ENTROPY_NODES][2];
-  vp9_prob coef_probs[ENTROPY_NODES];
   int factor;
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT
-  int entropy_nodes_adapt = UNCONSTRAINED_ADAPT_NODES;
-#else
-  int entropy_nodes_adapt = ENTROPY_NODES;
-#endif
+  unsigned int branch_ct[UNCONSTRAINED_NODES][2];
+  vp9_prob coef_probs[UNCONSTRAINED_NODES];
+  int entropy_nodes_adapt = UNCONSTRAINED_NODES;
 
-  for (i = 0; i < block_types; ++i)
+  for (i = 0; i < BLOCK_TYPES; ++i)
     for (j = 0; j < REF_TYPES; ++j)
       for (k = 0; k < COEF_BANDS; ++k)
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           if (l >= 3 && k == 0)
             continue;
-          vp9_tree_probs_from_distribution(vp9_coef_tree,
-                                           coef_probs, branch_ct,
-                                           coef_counts[i][j][k][l], 0);
+          vp9_tree_probs_from_distribution(
+              vp9_coefmodel_tree,
+              coef_probs, branch_ct,
+              coef_counts[i][j][k][l], 0);
+#if CONFIG_BALANCED_COEFTREE
+          branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0];
+          coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]);
+#else
           branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
           coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
+#endif
           for (t = 0; t < entropy_nodes_adapt; ++t) {
             count = branch_ct[t][0] + branch_ct[t][1];
             count = count > count_sat ? count_sat : count;
@@ -3560,21 +714,16 @@
             dst_coef_probs[i][j][k][l][t] =
                 weighted_prob(pre_coef_probs[i][j][k][l][t],
                               coef_probs[t], factor);
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT
-            if (t == UNCONSTRAINED_NODES - 1)
-              vp9_get_model_distribution(
-                  dst_coef_probs[i][j][k][l][UNCONSTRAINED_NODES - 1],
-                  dst_coef_probs[i][j][k][l], i, j);
-#endif
           }
         }
 }
 
 void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+  TX_SIZE t;
   int count_sat;
   int update_factor; /* denominator 256 */
 
-  if (cm->frame_type == KEY_FRAME) {
+  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
     update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
     count_sat = COEF_COUNT_SAT_KEY;
   } else if (cm->last_frame_type == KEY_FRAME) {
@@ -3584,142 +733,6 @@
     update_factor = COEF_MAX_UPDATE_FACTOR;
     count_sat = COEF_COUNT_SAT;
   }
-
-  adapt_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
-                   BLOCK_TYPES, cm->fc.coef_counts_4x4,
-                   cm->fc.eob_branch_counts[TX_4X4],
-                   count_sat, update_factor);
-  adapt_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
-                   BLOCK_TYPES, cm->fc.coef_counts_8x8,
-                   cm->fc.eob_branch_counts[TX_8X8],
-                   count_sat, update_factor);
-  adapt_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
-                   BLOCK_TYPES, cm->fc.coef_counts_16x16,
-                   cm->fc.eob_branch_counts[TX_16X16],
-                   count_sat, update_factor);
-  adapt_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
-                   BLOCK_TYPES, cm->fc.coef_counts_32x32,
-                   cm->fc.eob_branch_counts[TX_32X32],
-                   count_sat, update_factor);
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    adapt_coef_probs(cm, t, count_sat, update_factor);
 }
-
-#if CONFIG_CODE_NONZEROCOUNT
-static void adapt_nzc_probs(VP9_COMMON *cm,
-                            int block_size,
-                            int count_sat,
-                            int update_factor) {
-  int c, r, b, n;
-  int count, factor;
-  unsigned int nzc_branch_ct[NZC32X32_NODES][2];
-  vp9_prob nzc_probs[NZC32X32_NODES];
-  int tokens, nodes;
-  const vp9_tree_index *nzc_tree;
-  vp9_prob *dst_nzc_probs;
-  vp9_prob *pre_nzc_probs;
-  unsigned int *nzc_counts;
-
-  if (block_size == 32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_tree = vp9_nzc32x32_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
-  } else if (block_size == 16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_tree = vp9_nzc16x16_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
-  } else if (block_size == 8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_tree = vp9_nzc8x8_tree;
-    dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
-  } else {
-    nzc_tree = vp9_nzc4x4_tree;
-    tokens = NZC4X4_TOKENS;
-    dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
-  }
-  nodes = tokens - 1;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
-    for (r = 0; r < REF_TYPES; ++r)
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        int offset_tokens = offset * tokens;
-        vp9_tree_probs_from_distribution(nzc_tree,
-                                         nzc_probs, nzc_branch_ct,
-                                         nzc_counts + offset_tokens, 0);
-        for (n = 0; n < nodes; ++n) {
-          count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          dst_nzc_probs[offset_nodes + n] =
-              weighted_prob(pre_nzc_probs[offset_nodes + n],
-                            nzc_probs[n], factor);
-        }
-      }
-}
-
-static void adapt_nzc_pcat(VP9_COMMON *cm, int count_sat, int update_factor) {
-  int c, t;
-  int count, factor;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      int b;
-      for (b = 0; b < bits; ++b) {
-        vp9_prob prob = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
-        count = cm->fc.nzc_pcat_counts[c][t][b][0] +
-                cm->fc.nzc_pcat_counts[c][t][b][1];
-        count = count > count_sat ? count_sat : count;
-        factor = (update_factor * count / count_sat);
-        cm->fc.nzc_pcat_probs[c][t][b] = weighted_prob(
-            cm->fc.pre_nzc_pcat_probs[c][t][b], prob, factor);
-      }
-    }
-  }
-}
-
-// #define NZC_COUNT_TESTING
-void vp9_adapt_nzc_probs(VP9_COMMON *cm) {
-  int count_sat;
-  int update_factor; /* denominator 256 */
-#ifdef NZC_COUNT_TESTING
-  int c, r, b, t;
-  printf("\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("    {");
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]);
-        }
-        printf("}\n");
-      }
-      printf("\n");
-    }
-#endif
-
-  if (cm->frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
-    count_sat = COEF_COUNT_SAT_KEY;
-  } else if (cm->last_frame_type == KEY_FRAME) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
-    count_sat = COEF_COUNT_SAT_AFTER_KEY;
-  } else {
-    update_factor = COEF_MAX_UPDATE_FACTOR;
-    count_sat = COEF_COUNT_SAT;
-  }
-
-  adapt_nzc_probs(cm, 4, count_sat, update_factor);
-  adapt_nzc_probs(cm, 8, count_sat, update_factor);
-  adapt_nzc_probs(cm, 16, count_sat, update_factor);
-  adapt_nzc_probs(cm, 32, count_sat, update_factor);
-  adapt_nzc_pcat(cm, count_sat, update_factor);
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -16,8 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 
-extern const int vp9_i8x8_block[4];
-
 /* Coefficient token alphabet */
 
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
@@ -40,16 +38,19 @@
 
 extern const vp9_tree_index vp9_coef_tree[];
 
-extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+#define DCT_EOB_MODEL_TOKEN     3      /* EOB       Extra Bits 0+0 */
+extern const vp9_tree_index vp9_coefmodel_tree[];
 
+extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
 typedef struct {
   vp9_tree_p tree;
   const vp9_prob *prob;
-  int Len;
+  int len;
   int base_val;
-} vp9_extra_bit_struct;
+} vp9_extra_bit;
 
-extern vp9_extra_bit_struct vp9_extra_bits[12];    /* indexed by token value */
+extern vp9_extra_bit vp9_extra_bits[12];    /* indexed by token value */
 
 #define PROB_UPDATE_BASELINE_COST   7
 
@@ -84,6 +85,8 @@
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
 #define PREV_COEF_CONTEXTS          6
 
+// #define ENTROPY_STATS
+
 typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                     [MAX_ENTROPY_TOKENS];
 typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
@@ -96,173 +99,126 @@
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]);
 
 extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
 
-extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
+extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]);
 
 extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]);
 
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]);
 
 extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]);
 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]);
 
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]);
 
 void vp9_coef_tree_initialize(void);
 void vp9_adapt_coef_probs(struct VP9Common *);
 
-static INLINE void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
+static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd,
+                                               BLOCK_SIZE_TYPE bsize) {
   /* Clear entropy contexts */
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  const int bw = 1 << b_width_log2(bsize);
+  const int bh = 1 << b_height_log2(bsize);
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    vpx_memset(xd->plane[i].above_context, 0,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x);
+    vpx_memset(xd->plane[i].left_context, 0,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y);
+  }
 }
 
-static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd) {
-  /* Clear entropy contexts */
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-}
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transform and above are in the top band.
+// For 4x4 blocks the index is less but to keep things common the lookup
+// table for 4x4 is padded out to this index.
+#define MAXBAND_INDEX 21
 
-static INLINE void vp9_reset_sb64_tokens_context(MACROBLOCKD* const xd) {
-  /* Clear entropy contexts */
-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-}
+extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
+extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
 
-extern const int vp9_coef_bands8x8[64];
-extern const int vp9_coef_bands4x4[16];
 
-static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) {
-  if (tx_size == TX_4X4) {
-    return vp9_coef_bands4x4[scan[coef_index]];
-  } else {
-    const int pos = scan[coef_index];
-    const int sz = 1 << (2 + tx_size);
-    const int x = pos & (sz - 1), y = pos >> (2 + tx_size);
-    if (x >= 8 || y >= 8)
-      return 5;
-    else
-      return vp9_coef_bands8x8[y * 8 + x];
-  }
+static int get_coef_band(const uint8_t * band_translate, int coef_index) {
+  return (coef_index > MAXBAND_INDEX)
+    ? (COEF_BANDS-1) : band_translate[coef_index];
 }
+
 extern int vp9_get_coef_context(const int *scan, const int *neighbors,
                                 int nb_pad, uint8_t *token_cache, int c, int l);
 const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
 
-#if CONFIG_MODELCOEFPROB
-#define COEFPROB_BITS               8
-#define COEFPROB_MODELS             (1 << COEFPROB_BITS)
 
-// 2 => EOB and Zero nodes are unconstrained, rest are modeled
-// 3 => EOB, Zero and One nodes are unconstrained, rest are modeled
-#define UNCONSTRAINED_NODES         3   // Choose one of 2 or 3
+// 128 lists of probabilities are stored for the following ONE node probs:
+// 1, 3, 5, 7, ..., 253, 255
+// In between probabilities are interpolated linearly
 
-// whether forward updates are model-based
-#define MODEL_BASED_UPDATE          0
-// if model-based how many nodes are unconstrained
-#define UNCONSTRAINED_UPDATE_NODES  3
-// whether backward updates are model-based
-#define MODEL_BASED_ADAPT           0
-#define UNCONSTRAINED_ADAPT_NODES   3
+#define COEFPROB_MODELS             128
 
-// whether to adjust the coef probs for key frames based on qindex
-#define ADJUST_KF_COEF_PROBS        0
+#define UNCONSTRAINED_NODES         3
+#define MODEL_NODES                 (ENTROPY_NODES - UNCONSTRAINED_NODES)
 
+#define PIVOT_NODE                  2   // which node is pivot
+
 typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
-                                      [PREV_COEF_CONTEXTS][2];
-extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
-void vp9_get_model_distribution(vp9_prob model, vp9_prob *tree_probs,
-                                int b, int r);
-void vp9_adjust_default_coef_probs(struct VP9Common *cm);
-#endif  // CONFIG_MODELCOEFPROB
+                                      [PREV_COEF_CONTEXTS]
+                                      [UNCONSTRAINED_NODES];
 
-#if CONFIG_CODE_NONZEROCOUNT
-/* Alphabet for number of non-zero symbols in block */
-#define NZC_0                   0       /* Used for all blocks */
-#define NZC_1                   1       /* Used for all blocks */
-#define NZC_2                   2       /* Used for all blocks */
-#define NZC_3TO4                3       /* Used for all blocks */
-#define NZC_5TO8                4       /* Used for all blocks */
-#define NZC_9TO16               5       /* Used for all blocks */
-#define NZC_17TO32              6       /* Used for 8x8 and larger blocks */
-#define NZC_33TO64              7       /* Used for 8x8 and larger blocks */
-#define NZC_65TO128             8       /* Used for 16x16 and larger blocks */
-#define NZC_129TO256            9       /* Used for 16x16 and larger blocks */
-#define NZC_257TO512           10       /* Used for 32x32 and larger blocks */
-#define NZC_513TO1024          11       /* Used for 32x32 and larger blocks */
+typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
+                                          [PREV_COEF_CONTEXTS]
+                                          [UNCONSTRAINED_NODES + 1];
+typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS]
+                                          [PREV_COEF_CONTEXTS]
+                                          [UNCONSTRAINED_NODES][2];
+extern void vp9_full_to_model_count(unsigned int *model_count,
+                                    unsigned int *full_count);
+extern void vp9_full_to_model_counts(
+    vp9_coeff_count_model *model_count, vp9_coeff_count *full_count);
 
-/* Number of tokens for each block size */
-#define NZC4X4_TOKENS           6
-#define NZC8X8_TOKENS           8
-#define NZC16X16_TOKENS        10
-#define NZC32X32_TOKENS        12
+void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
-/* Number of nodes for each block size */
-#define NZC4X4_NODES            5
-#define NZC8X8_NODES            7
-#define NZC16X16_NODES          9
-#define NZC32X32_NODES         11
+void vp9_model_to_full_probs_sb(
+    vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES],
+    vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]);
 
-/* Max number of tokens with extra bits */
-#define NZC_TOKENS_EXTRA        9
+extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];
 
-/* Max number of extra bits */
-#define NZC_BITS_EXTRA          9
+static INLINE const int* get_scan_4x4(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_4x4;
+    case DCT_ADST:
+      return vp9_col_scan_4x4;
+    default:
+      return vp9_default_scan_4x4;
+  }
+}
 
-/* Tokens without extra bits */
-#define NZC_TOKENS_NOEXTRA      (NZC32X32_TOKENS - NZC_TOKENS_EXTRA)
+static INLINE const int* get_scan_8x8(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_8x8;
+    case DCT_ADST:
+      return vp9_col_scan_8x8;
+    default:
+      return vp9_default_scan_8x8;
+  }
+}
 
-#define MAX_NZC_CONTEXTS        3
-
-/* whether to update extra bit probabilities */
-#define NZC_PCAT_UPDATE
-
-/* nzc trees */
-extern const vp9_tree_index    vp9_nzc4x4_tree[];
-extern const vp9_tree_index    vp9_nzc8x8_tree[];
-extern const vp9_tree_index    vp9_nzc16x16_tree[];
-extern const vp9_tree_index    vp9_nzc32x32_tree[];
-
-/* nzc encodings */
-extern struct vp9_token_struct  vp9_nzc4x4_encodings[NZC4X4_TOKENS];
-extern struct vp9_token_struct  vp9_nzc8x8_encodings[NZC8X8_TOKENS];
-extern struct vp9_token_struct  vp9_nzc16x16_encodings[NZC16X16_TOKENS];
-extern struct vp9_token_struct  vp9_nzc32x32_encodings[NZC32X32_TOKENS];
-
-#define codenzc(x) (\
-  (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \
-  (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\
-  (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11)
-
-int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur,
-                               int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur,
-                                int mb_row, int mb_col, int block);
-int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block);
-void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd,
-                           int mb_row, int mb_col);
-void vp9_adapt_nzc_probs(struct VP9Common *cm);
-
-/* Extra bits array */
-extern const int vp9_extranzcbits[NZC32X32_TOKENS];
-
-/* Base nzc values */
-extern const int vp9_basenzcvalue[NZC32X32_TOKENS];
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
+static INLINE const int* get_scan_16x16(TX_TYPE tx_type) {
+  switch (tx_type) {
+    case ADST_DCT:
+      return vp9_row_scan_16x16;
+    case DCT_ADST:
+      return vp9_col_scan_16x16;
+    default:
+      return vp9_default_scan_16x16;
+  }
+}
 
 #include "vp9/common/vp9_coefupdateprobs.h"
 
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -15,464 +15,274 @@
 #include "vp9/common/vp9_alloccommon.h"
 #include "vpx_mem/vpx_mem.h"
 
-static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
-  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
-  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
-  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
-  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
-  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
-  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
-  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
+static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES - 1] = {
+  { 144,  11,  54, 157, 195, 130,  46,  58, 108 } /* y = dc */,
+  { 118,  15, 123, 148, 131, 101,  44,  93, 131 } /* y = v */,
+  { 113,  12,  23, 188, 226, 142,  26,  32, 125 } /* y = h */,
+  { 120,  11,  50, 123, 163, 135,  64,  77, 103 } /* y = d45 */,
+  { 113,   9,  36, 155, 111, 157,  32,  44, 161 } /* y = d135 */,
+  { 116,   9,  55, 176,  76,  96,  37,  61, 149 } /* y = d117 */,
+  { 115,   9,  28, 141, 161, 167,  21,  25, 193 } /* y = d153 */,
+  { 120,  12,  32, 145, 195, 142,  32,  38,  86 } /* y = d27 */,
+  { 116,  12,  64, 120, 140, 125,  49, 115, 121 } /* y = d63 */,
+  { 102,  19,  66, 162, 182, 122,  35,  59, 128 } /* y = tm */
 };
 
-static const unsigned int y_mode_cts  [VP9_YMODES] = {
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */
-  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
+static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS]
+                                        [VP9_INTRA_MODES - 1] = {
+  {  65,  32,  18, 144, 162, 194,  41,  51,  98 } /* block_size < 8x8 */,
+  { 132,  68,  18, 165, 217, 196,  45,  40,  78 } /* block_size < 16x16 */,
+  { 173,  80,  19, 176, 240, 193,  64,  35,  46 } /* block_size < 32x32 */,
+  { 221, 135,  38, 194, 248, 121,  96,  85,  29 } /* block_size >= 32x32 */
 };
 
-static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  /* DC   V   H  D45 135 117 153 D27 D63 TM */
-  { 200, 15, 15, 10, 10, 10, 10, 10, 10,  6}, /* DC */
-  { 130, 75, 10, 10, 10, 10, 10, 10, 10,  6}, /* V */
-  { 130, 10, 75, 10, 10, 10, 10, 10, 10,  6}, /* H */
-  { 130, 15, 10, 75, 10, 10, 10, 10, 10,  6}, /* D45 */
-  { 150, 15, 10, 10, 75, 10, 10, 10, 10,  6}, /* D135 */
-  { 150, 15, 10, 10, 10, 75, 10, 10, 10,  6}, /* D117 */
-  { 150, 15, 10, 10, 10, 10, 75, 10, 10,  6}, /* D153 */
-  { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
-  { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
-  { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
-  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
-  { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */
+static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES - 1] = {
+  { 120,   7,  76, 176, 208, 126,  28,  54, 103 } /* y = dc */,
+  {  48,  12, 154, 155, 139,  90,  34, 117, 119 } /* y = v */,
+  {  67,   6,  25, 204, 243, 158,  13,  21,  96 } /* y = h */,
+  {  97,   5,  44, 131, 176, 139,  48,  68,  97 } /* y = d45 */,
+  {  83,   5,  42, 156, 111, 152,  26,  49, 152 } /* y = d135 */,
+  {  80,   5,  58, 178,  74,  83,  33,  62, 145 } /* y = d117 */,
+  {  86,   5,  32, 154, 192, 168,  14,  22, 163 } /* y = d153 */,
+  {  85,   5,  32, 156, 216, 148,  19,  29,  73 } /* y = d27 */,
+  {  77,   7,  64, 116, 132, 122,  37, 126, 120 } /* y = d63 */,
+  { 101,  21, 107, 181, 192, 103,  19,  67, 125 } /* y = tm */
 };
 
-static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
-  /* DC V   H D45 135 117 153 D27 D63  TM */
-  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
+const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
+                                  [PARTITION_TYPES - 1] = {
+  { /* frame_type = keyframe */
+    /* 8x8 -> 4x4 */
+    { 158,  97,  94 } /* a/l both not split */,
+    {  93,  24,  99 } /* a split, l not split */,
+    {  85, 119,  44 } /* l split, a not split */,
+    {  62,  59,  67 } /* a/l both split */,
+    /* 16x16 -> 8x8 */
+    { 149,  53,  53 } /* a/l both not split */,
+    {  94,  20,  48 } /* a split, l not split */,
+    {  83,  53,  24 } /* l split, a not split */,
+    {  52,  18,  18 } /* a/l both split */,
+    /* 32x32 -> 16x16 */
+    { 150,  40,  39 } /* a/l both not split */,
+    {  78,  12,  26 } /* a split, l not split */,
+    {  67,  33,  11 } /* l split, a not split */,
+    {  24,   7,   5 } /* a/l both split */,
+    /* 64x64 -> 32x32 */
+    { 174,  35,  49 } /* a/l both not split */,
+    {  68,  11,  27 } /* a split, l not split */,
+    {  57,  15,   9 } /* l split, a not split */,
+    {  12,   3,   3 } /* a/l both split */
+  }, { /* frame_type = interframe */
+    /* 8x8 -> 4x4 */
+    { 199, 122, 141 } /* a/l both not split */,
+    { 147,  63, 159 } /* a split, l not split */,
+    { 148, 133, 118 } /* l split, a not split */,
+    { 121, 104, 114 } /* a/l both split */,
+    /* 16x16 -> 8x8 */
+    { 174,  73,  87 } /* a/l both not split */,
+    {  92,  41,  83 } /* a split, l not split */,
+    {  82,  99,  50 } /* l split, a not split */,
+    {  53,  39,  39 } /* a/l both split */,
+    /* 32x32 -> 16x16 */
+    { 177,  58,  59 } /* a/l both not split */,
+    {  68,  26,  63 } /* a split, l not split */,
+    {  52,  79,  25 } /* l split, a not split */,
+    {  17,  14,  12 } /* a/l both split */,
+    /* 64x64 -> 32x32 */
+    { 222,  34,  30 } /* a/l both not split */,
+    {  72,  16,  44 } /* a split, l not split */,
+    {  58,  32,  12 } /* l split, a not split */,
+    {  10,   7,   6 } /* a/l both split */
+  }
 };
 
-static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
-  // DC   V   H  D45 135 117 153 D27 D63 TM
-  { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
-  { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */
-  { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */
-  { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */
-  { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */
-  { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */
-  { 102, 33, 20, 20, 20, 20, 64, 20, 20, 14}, /* D153 */
-  { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
-  { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
-  { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = {
+  -DC_PRED, 2,                      /* 0 = DC_NODE */
+  -TM_PRED, 4,                      /* 1 = TM_NODE */
+  -V_PRED, 6,                       /* 2 = V_NODE */
+  8, 12,                            /* 3 = COM_NODE */
+  -H_PRED, 10,                      /* 4 = H_NODE */
+  -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
+  -D45_PRED, 14,                    /* 6 = D45_NODE */
+  -D63_PRED, 16,                    /* 7 = D63_NODE */
+  -D153_PRED, -D27_PRED             /* 8 = D153_NODE */
 };
 
-static const unsigned int bmode_cts[VP9_NKF_BINTRAMODES] = {
-#if CONFIG_NEWBINTRAMODES
-#if CONTEXT_PRED_REPLACEMENTS == 6
-  /* DC    TM     VE     HE   CONTEXT */
-  43891, 17694, 10036, 3920, 20000
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-  /* DC    TM     VE     HE   LD    RD   CONTEXT */
-  43891, 17694, 10036, 3920, 3363, 2546, 14000
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-  /* DC    TM     VE     HE   LD    RD   VR    VL    HD    HU   CONTEXT */
-  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723, 50000
-#endif
-#else
-  /* DC    TM     VE     HE   LD    RD    VR    VL    HD    HU */
-  43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723
-#endif
+const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
 };
 
-typedef enum {
-  SUBMVREF_NORMAL,
-  SUBMVREF_LEFT_ZED,
-  SUBMVREF_ABOVE_ZED,
-  SUBMVREF_LEFT_ABOVE_SAME,
-  SUBMVREF_LEFT_ABOVE_ZED
-} sumvfref_t;
+const vp9_tree_index vp9_partition_tree[6] = {
+  -PARTITION_NONE, 2,
+  -PARTITION_HORZ, 4,
+  -PARTITION_VERT, -PARTITION_SPLIT
+};
 
-int vp9_mv_cont(const int_mv *l, const int_mv *a) {
-  int lez = (l->as_int == 0);
-  int aez = (a->as_int == 0);
-  int lea = (l->as_int == a->as_int);
+struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
 
-  if (lea && lez)
-    return SUBMVREF_LEFT_ABOVE_ZED;
+struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
 
-  if (lea)
-    return SUBMVREF_LEFT_ABOVE_SAME;
+struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
-  if (aez)
-    return SUBMVREF_ABOVE_ZED;
-
-  if (lez)
-    return SUBMVREF_LEFT_ZED;
-
-  return SUBMVREF_NORMAL;
-}
-
-const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {
-  { 147, 136, 18 },
-  { 106, 145, 1  },
-  { 179, 121, 1  },
-  { 223, 1, 34 },
-  { 208, 1, 1  }
+static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
+  9, 102, 187, 225
 };
 
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
-  {
-    0,  0,  0,  0,
-    0,  0,  0,  0,
-    1,  1,  1,  1,
-    1,  1,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    2,  2,  3,  3,
-    2,  2,  3,  3,
-  }, {
-    0,  1,  2,  3,
-    4,  5,  6,  7,
-    8,  9,  10, 11,
-    12, 13, 14, 15,
-  },
+static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = {
+  239, 183, 119,  96,  41
 };
 
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-
-/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-
-const vp9_tree_index vp9_kf_bmode_tree[VP9_KF_BINTRAMODES * 2 - 2] = {
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                      /* 2 = VE_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                     /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,             /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                     /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                     /* 7 = VL_NODE */
-  -B_HD_PRED, -B_HU_PRED              /* 8 = HD_NODE */
+static const vp9_prob default_comp_ref_p[REF_CONTEXTS] = {
+  50, 126, 123, 221, 226
 };
 
-const vp9_tree_index vp9_bmode_tree[VP9_NKF_BINTRAMODES * 2 - 2] = {
-#if CONFIG_NEWBINTRAMODES
-#if CONTEXT_PRED_REPLACEMENTS == 6
-  -B_DC_PRED, 2,
-  -B_TM_PRED, 4,
-  6, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS),
-  -B_VE_PRED, -B_HE_PRED
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-  -B_DC_PRED, 2,
-  -B_TM_PRED, 4,
-  6, 8,
-  -B_VE_PRED, -B_HE_PRED,
-  10, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS),
-  -B_RD_PRED, -B_LD_PRED,
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                      /* 2 = VE_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                     /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,             /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                     /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                     /* 7 = VL_NODE */
-  -B_HD_PRED, 18,
-  -B_HU_PRED, -B_CONTEXT_PRED
-#endif
-#else
-  -B_DC_PRED, 2,                      /* 0 = DC_NODE */
-  -B_TM_PRED, 4,                      /* 1 = TM_NODE */
-  -B_VE_PRED, 6,                      /* 2 = VE_NODE */
-  8, 12,                              /* 3 = COM_NODE */
-  -B_HE_PRED, 10,                     /* 4 = HE_NODE */
-  -B_RD_PRED, -B_VR_PRED,             /* 5 = RD_NODE */
-  -B_LD_PRED, 14,                     /* 6 = LD_NODE */
-  -B_VL_PRED, 16,                     /* 7 = VL_NODE */
-  -B_HD_PRED, -B_HU_PRED              /* 8 = HD_NODE */
-#endif
+static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = {
+  {  33,  16 },
+  {  77,  74 },
+  { 142, 142 },
+  { 172, 170 },
+  { 238, 247 }
 };
 
-/* Again, these trees use the same probability indices as their
-   explicitly-programmed predecessors. */
-const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
+const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
+                                          [TX_SIZE_MAX_SB - 1] = {
+  { 3, 136, 37, },
+  { 5, 52, 13, },
 };
-
-const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  16, 18,
-  -V_PRED, -H_PRED,
-  -TM_PRED, 20,
-  -B_PRED, -I8X8_PRED
+const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
+                                          [TX_SIZE_MAX_SB - 2] = {
+  { 20, 152, },
+  { 15, 101, },
 };
-
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
+const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
+                                        [TX_SIZE_MAX_SB - 3] = {
+  { 100, },
+  { 66, },
 };
 
-const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
+void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+                                      unsigned int (*ct_32x32p)[2]) {
+  ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
+  ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
+                    tx_count_32x32p[TX_16X16] +
+                    tx_count_32x32p[TX_32X32];
+  ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
+  ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] +
+                    tx_count_32x32p[TX_32X32];
+  ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
+  ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
+}
 
-const vp9_tree_index vp9_mbsplit_tree[6] = {
-  -PARTITIONING_4X4,   2,
-  -PARTITIONING_8X8,   4,
-  -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
+void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+                                      unsigned int (*ct_16x16p)[2]) {
+  ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
+  ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] +
+                    tx_count_16x16p[TX_16X16];
+  ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
+  ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
+}
 
-const vp9_tree_index vp9_mv_ref_tree[8] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, 6,
-  -NEWMV, -SPLITMV
-};
+void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+                                    unsigned int (*ct_8x8p)[2]) {
+  ct_8x8p[0][0] =   tx_count_8x8p[TX_4X4];
+  ct_8x8p[0][1] =   tx_count_8x8p[TX_8X8];
+}
 
-const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
-  -ZEROMV, 2,
-  -NEARESTMV, 4,
-  -NEARMV, -NEWMV
+const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = {
+  192, 128, 64
 };
 
-const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
-  -LEFT4X4, 2,
-  -ABOVE4X4, 4,
-  -ZERO4X4, -NEW4X4
-};
-
-struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES];
-struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES];
-struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES];
-struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-
-struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
-
 void vp9_init_mbmode_probs(VP9_COMMON *x) {
-  unsigned int bct [VP9_YMODES] [2];      /* num Ymodes > num UV modes */
+  vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs,
+             sizeof(default_if_uv_probs));
+  vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs,
+             sizeof(default_kf_uv_probs));
+  vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs,
+             sizeof(default_if_y_probs));
 
-  vp9_tree_probs_from_distribution(vp9_ymode_tree, x->fc.ymode_prob,
-                                   bct, y_mode_cts, 0);
-  vp9_tree_probs_from_distribution(vp9_sb_ymode_tree, x->fc.sb_ymode_prob,
-                                   bct, y_mode_cts, 0);
-  {
-    int i;
-    for (i = 0; i < 8; i++) {
-      vp9_tree_probs_from_distribution(vp9_kf_ymode_tree, x->kf_ymode_prob[i],
-                                       bct, kf_y_mode_cts[i], 0);
-      vp9_tree_probs_from_distribution(vp9_sb_kf_ymode_tree,
-                                       x->sb_kf_ymode_prob[i], bct,
-                                       kf_y_mode_cts[i], 0);
-    }
-  }
-  {
-    int i;
-    for (i = 0; i < VP9_YMODES; i++) {
-      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
-                                       bct, kf_uv_mode_cts[i], 0);
-      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
-                                       bct, uv_mode_cts[i], 0);
-    }
-  }
-
-  vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 0);
-
-  vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
-             sizeof(vp9_sub_mv_ref_prob2));
-  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
   vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
              sizeof(vp9_switchable_interp_prob));
-#if CONFIG_COMP_INTERINTRA_PRED
-  x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB;
-#endif
-  x->ref_pred_probs[0] = 120;
-  x->ref_pred_probs[1] = 80;
-  x->ref_pred_probs[2] = 40;
-}
 
+  vpx_memcpy(x->fc.partition_prob, vp9_partition_probs,
+             sizeof(vp9_partition_probs));
 
-static void intra_bmode_probs_from_distribution(
-  vp9_prob p[VP9_NKF_BINTRAMODES - 1],
-  unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2],
-  const unsigned int events[VP9_NKF_BINTRAMODES]) {
-  vp9_tree_probs_from_distribution(vp9_bmode_tree, p, branch_ct, events, 0);
+  vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p,
+             sizeof(default_intra_inter_p));
+  vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p,
+             sizeof(default_comp_inter_p));
+  vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p,
+             sizeof(default_comp_ref_p));
+  vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p,
+             sizeof(default_single_ref_p));
+  vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p,
+             sizeof(vp9_default_tx_probs_32x32p));
+  vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p,
+             sizeof(vp9_default_tx_probs_16x16p));
+  vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p,
+             sizeof(vp9_default_tx_probs_8x8p));
+  vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs,
+             sizeof(vp9_default_mbskip_probs));
 }
 
-void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) {
-  unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2];
-  intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts);
-}
-
-static void intra_kf_bmode_probs_from_distribution(
-  vp9_prob p[VP9_KF_BINTRAMODES - 1],
-  unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2],
-  const unsigned int events[VP9_KF_BINTRAMODES]) {
-  vp9_tree_probs_from_distribution(vp9_kf_bmode_tree, p, branch_ct, events, 0);
-}
-
-void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES]
-                                          [VP9_KF_BINTRAMODES]
-                                          [VP9_KF_BINTRAMODES - 1]) {
-  unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2];
-  int i, j;
-
-  for (i = 0; i < VP9_KF_BINTRAMODES; ++i) {
-    for (j = 0; j < VP9_KF_BINTRAMODES; ++j) {
-      intra_kf_bmode_probs_from_distribution(
-          p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]);
-    }
-  }
-}
-
-#if VP9_SWITCHABLE_FILTERS == 3
 const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
   -0, 2,
   -1, -2
 };
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-#if CONFIG_ENABLE_6TAP
+struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
 const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  SIXTAP, EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {0, -1, 1, 2, -1, -1};
-#else
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
   EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
 const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
-#endif
 const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
                                           [VP9_SWITCHABLE_FILTERS-1] = {
-  {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
 };
-#elif VP9_SWITCHABLE_FILTERS == 2
-const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = {
-  -0, -1,
-};
-struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
-const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
-                                          [VP9_SWITCHABLE_FILTERS-1] = {
-  {248},
-  { 64},
-  {192},
-};
-const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, EIGHTTAP_SHARP};
-#if CONFIG_ENABLE_6TAP
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1};
-#else
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};
-#endif
-#endif  // VP9_SWITCHABLE_FILTERS
 
 // Indicates if the filter is interpolating or non-interpolating
-// Note currently only the EIGHTTAP_SMOOTH is non-interpolating
-#if CONFIG_ENABLE_6TAP
-const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 0, 1, 1, 1, -1};
-#else
-const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {0, 1, 1, 1, -1};
-#endif
+const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1};
 
 void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_kf_bmode_encodings,   vp9_kf_bmode_tree);
-  vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
-  vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
-  vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
-  vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
-  vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
-  vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
-  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
-  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
+  vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
   vp9_tokens_from_tree(vp9_switchable_interp_encodings,
                        vp9_switchable_interp_tree);
+  vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
 
-  vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
-                              vp9_mv_ref_tree, NEARESTMV);
   vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
                               vp9_sb_mv_ref_tree, NEARESTMV);
-  vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
-                              vp9_sub_mv_ref_tree, LEFT4X4);
 }
 
 void vp9_init_mode_contexts(VP9_COMMON *pc) {
-  vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct));
-  vpx_memcpy(pc->fc.vp9_mode_contexts,
-             vp9_default_mode_contexts,
-             sizeof(vp9_default_mode_contexts));
+  vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts));
+  vpx_memcpy(pc->fc.inter_mode_probs,
+             vp9_default_inter_mode_probs,
+             sizeof(vp9_default_inter_mode_probs));
 }
 
 void vp9_accum_mv_refs(VP9_COMMON *pc,
                        MB_PREDICTION_MODE m,
                        const int context) {
-  unsigned int (*mv_ref_ct)[4][2];
+  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+      pc->fc.inter_mode_counts;
 
-  mv_ref_ct = pc->fc.mv_ref_ct;
-
   if (m == ZEROMV) {
-    ++mv_ref_ct[context][0][0];
+    ++inter_mode_counts[context][0][0];
   } else {
-    ++mv_ref_ct[context][0][1];
+    ++inter_mode_counts[context][0][1];
     if (m == NEARESTMV) {
-      ++mv_ref_ct[context][1][0];
+      ++inter_mode_counts[context][1][0];
     } else {
-      ++mv_ref_ct[context][1][1];
+      ++inter_mode_counts[context][1][1];
       if (m == NEARMV) {
-        ++mv_ref_ct[context][2][0];
+        ++inter_mode_counts[context][2][0];
       } else {
-        ++mv_ref_ct[context][2][1];
-        if (m == NEWMV) {
-          ++mv_ref_ct[context][3][0];
-        } else {
-          ++mv_ref_ct[context][3][1];
-        }
+        ++inter_mode_counts[context][2][1];
       }
     }
   }
@@ -482,50 +292,35 @@
 #define MVREF_MAX_UPDATE_FACTOR 128
 void vp9_adapt_mode_context(VP9_COMMON *pc) {
   int i, j;
-  unsigned int (*mv_ref_ct)[4][2];
-  int (*mode_context)[4];
+  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+      pc->fc.inter_mode_counts;
+  vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs;
 
-  mode_context = pc->fc.vp9_mode_contexts;
-
-  mv_ref_ct = pc->fc.mv_ref_ct;
-
   for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    for (i = 0; i < 4; i++) {
-      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1], factor;
-
+    for (i = 0; i < VP9_INTER_MODES - 1; i++) {
+      int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1];
+      int factor;
       count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
       factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
-      mode_context[j][i] = weighted_prob(pc->fc.vp9_mode_contexts[j][i],
-                                         get_binary_prob(mv_ref_ct[j][i][0],
-                                                         mv_ref_ct[j][i][1]),
-                                         factor);
+      mode_context[j][i] = weighted_prob(
+          pc->fc.pre_inter_mode_probs[j][i],
+          get_binary_prob(inter_mode_counts[j][i][0],
+                          inter_mode_counts[j][i][1]),
+          factor);
     }
   }
 }
 
-#ifdef MODE_STATS
-#include "vp9/common/vp9_modecont.h"
-void print_mode_contexts(VP9_COMMON *pc) {
-  int j, i;
-  printf("\n====================\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context[j][i]);
-    }
-    printf("\n");
-  }
-  printf("====================\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    for (i = 0; i < 4; i++) {
-      printf("%4d ", pc->fc.mode_context_a[j][i]);
-    }
-    printf("\n");
-  }
+#define MODE_COUNT_SAT 20
+#define MODE_MAX_UPDATE_FACTOR 128
+static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob,
+                          unsigned int branch_ct[2]) {
+  int factor, count = branch_ct[0] + branch_ct[1];
+  count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+  factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+  return weighted_prob(pre_prob, prob, factor);
 }
-#endif
 
-#define MODE_COUNT_SAT 20
-#define MODE_MAX_UPDATE_FACTOR 144
 static void update_mode_probs(int n_modes,
                               const vp9_tree_index *tree, unsigned int *cnt,
                               vp9_prob *pre_probs, vp9_prob *dst_probs,
@@ -533,33 +328,37 @@
 #define MAX_PROBS 32
   vp9_prob probs[MAX_PROBS];
   unsigned int branch_ct[MAX_PROBS][2];
-  int t, count, factor;
+  int t;
 
   assert(n_modes - 1 < MAX_PROBS);
   vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
-  for (t = 0; t < n_modes - 1; ++t) {
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    dst_probs[t] = weighted_prob(pre_probs[t], probs[t], factor);
-  }
+  for (t = 0; t < n_modes - 1; ++t)
+    dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]);
 }
 
+static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) {
+  return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0],
+                                                  branch_ct[1]), branch_ct);
+}
+
 // #define MODE_COUNT_TESTING
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
-  int i;
+  int i, j;
+  FRAME_CONTEXT *fc = &cm->fc;
 #ifdef MODE_COUNT_TESTING
   int t;
 
   printf("static const unsigned int\nymode_counts"
-         "[VP9_YMODES] = {\n");
-  for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
+         "[VP9_INTRA_MODES] = {\n");
+  for (t = 0; t < VP9_INTRA_MODES; ++t)
+    printf("%d, ", fc->ymode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\nuv_mode_counts"
-         "[VP9_YMODES] [VP9_UV_MODES] = {\n");
-  for (i = 0; i < VP9_YMODES; ++i) {
+         "[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n");
+  for (i = 0; i < VP9_INTRA_MODES; ++i) {
     printf("  {");
-    for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]);
+    for (t = 0; t < VP9_INTRA_MODES; ++t)
+      printf("%d, ", fc->uv_mode_counts[i][t]);
     printf("},\n");
   }
   printf("};\n");
@@ -566,71 +365,108 @@
   printf("static const unsigned int\nbmode_counts"
          "[VP9_NKF_BINTRAMODES] = {\n");
   for (t = 0; t < VP9_NKF_BINTRAMODES; ++t)
-    printf("%d, ", cm->fc.bmode_counts[t]);
+    printf("%d, ", fc->bmode_counts[t]);
   printf("};\n");
   printf("static const unsigned int\ni8x8_mode_counts"
          "[VP9_I8X8_MODES] = {\n");
-  for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]);
+  for (t = 0; t < VP9_I8X8_MODES; ++t)
+    printf("%d, ", fc->i8x8_mode_counts[t]);
   printf("};\n");
-  printf("static const unsigned int\nsub_mv_ref_counts"
-         "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n");
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    printf("  {");
-    for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]);
-    printf("},\n");
-  }
-  printf("};\n");
   printf("static const unsigned int\nmbsplit_counts"
          "[VP9_NUMMBSPLITS] = {\n");
-  for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]);
+  for (t = 0; t < VP9_NUMMBSPLITS; ++t)
+    printf("%d, ", fc->mbsplit_counts[t]);
   printf("};\n");
-#if CONFIG_COMP_INTERINTRA_PRED
-  printf("static const unsigned int\ninterintra_counts"
-         "[2] = {\n");
-  for (t = 0; t < 2; ++t) printf("%d, ", cm->fc.interintra_counts[t]);
-  printf("};\n");
 #endif
-#endif
 
-  update_mode_probs(VP9_YMODES, vp9_ymode_tree,
-                    cm->fc.ymode_counts, cm->fc.pre_ymode_prob,
-                    cm->fc.ymode_prob, 0);
-  update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_tree,
-                    cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob,
-                    cm->fc.sb_ymode_prob, 0);
-  for (i = 0; i < VP9_YMODES; ++i) {
-    update_mode_probs(VP9_UV_MODES, vp9_uv_mode_tree,
-                      cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i],
-                      cm->fc.uv_mode_prob[i], 0);
-  }
-  update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,
-                    cm->fc.bmode_counts, cm->fc.pre_bmode_prob,
-                    cm->fc.bmode_prob, 0);
-  update_mode_probs(VP9_I8X8_MODES,
-                    vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts,
-                    cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob, 0);
-  for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    update_mode_probs(VP9_SUBMVREFS,
-                      vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i],
-                      cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i],
-                      LEFT4X4);
-  }
-  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,
-                    cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob,
-                    cm->fc.mbsplit_prob, 0);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->use_interintra) {
-    int factor, interintra_prob, count;
+  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+    fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i],
+                                              fc->intra_inter_count[i]);
+  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+    fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i],
+                                             fc->comp_inter_count[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i],
+                                           fc->comp_ref_count[i]);
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < 2; j++)
+      fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j],
+                                                  fc->single_ref_count[i][j]);
 
-    interintra_prob = get_binary_prob(cm->fc.interintra_counts[0],
-                                      cm->fc.interintra_counts[1]);
-    count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    cm->fc.interintra_prob = weighted_prob(cm->fc.pre_interintra_prob,
-                                           interintra_prob, factor);
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+    update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
+                      fc->y_mode_counts[i], fc->pre_y_mode_prob[i],
+                      fc->y_mode_prob[i], 0);
+
+  for (i = 0; i < VP9_INTRA_MODES; ++i)
+    update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
+                      fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i],
+                      fc->uv_mode_prob[i], 0);
+
+  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
+    update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
+                      fc->partition_counts[i], fc->pre_partition_prob[i],
+                      fc->partition_prob[INTER_FRAME][i], 0);
+
+  if (cm->mcomp_filter_type == SWITCHABLE) {
+    for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) {
+      update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
+                        fc->switchable_interp_count[i],
+                        fc->pre_switchable_interp_prob[i],
+                        fc->switchable_interp_prob[i], 0);
+    }
   }
-#endif
+  if (cm->txfm_mode == TX_MODE_SELECT) {
+    int j;
+    unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2];
+    unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2];
+    unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2];
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
+                                     branch_ct_8x8p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
+        int factor;
+        int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1];
+        vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0],
+                                        branch_ct_8x8p[j][1]);
+        count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+        factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+        cm->fc.tx_probs_8x8p[i][j] = weighted_prob(
+            cm->fc.pre_tx_probs_8x8p[i][j], prob, factor);
+      }
+    }
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
+                                       branch_ct_16x16p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
+        int factor;
+        int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1];
+        vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0],
+                                        branch_ct_16x16p[j][1]);
+        count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+        factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+        cm->fc.tx_probs_16x16p[i][j] = weighted_prob(
+            cm->fc.pre_tx_probs_16x16p[i][j], prob, factor);
+      }
+    }
+    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+      tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
+                                       branch_ct_32x32p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
+        int factor;
+        int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1];
+        vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0],
+                                        branch_ct_32x32p[j][1]);
+        count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+        factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+        cm->fc.tx_probs_32x32p[i][j] = weighted_prob(
+            cm->fc.pre_tx_probs_32x32p[i][j], prob, factor);
+      }
+    }
+  }
+  for (i = 0; i < MBSKIP_CONTEXTS; ++i)
+    fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i],
+                                          fc->mbskip_count[i]);
 }
 
 static void set_default_lf_deltas(MACROBLOCKD *xd) {
@@ -637,15 +473,13 @@
   xd->mode_ref_lf_delta_enabled = 1;
   xd->mode_ref_lf_delta_update = 1;
 
-  xd->ref_lf_deltas[INTRA_FRAME] = 2;
+  xd->ref_lf_deltas[INTRA_FRAME] = 1;
   xd->ref_lf_deltas[LAST_FRAME] = 0;
-  xd->ref_lf_deltas[GOLDEN_FRAME] = -2;
-  xd->ref_lf_deltas[ALTREF_FRAME] = -2;
+  xd->ref_lf_deltas[GOLDEN_FRAME] = -1;
+  xd->ref_lf_deltas[ALTREF_FRAME] = -1;
 
-  xd->mode_lf_deltas[0] = 4;               // BPRED
-  xd->mode_lf_deltas[1] = -2;              // Zero
-  xd->mode_lf_deltas[2] = 2;               // New mv
-  xd->mode_lf_deltas[3] = 4;               // Split mv
+  xd->mode_lf_deltas[0] = 0;              // Zero
+  xd->mode_lf_deltas[1] = 0;               // New mv
 }
 
 void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
@@ -655,9 +489,9 @@
   vp9_clearall_segfeatures(xd);
   xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
   if (cm->last_frame_seg_map)
-    vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));
+    vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
-  /* reset the mode ref deltas for loop filter */
+  // Reset the mode ref deltas for loop filter
   vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));
   vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));
   set_default_lf_deltas(xd);
@@ -664,33 +498,38 @@
 
   vp9_default_coef_probs(cm);
   vp9_init_mbmode_probs(cm);
-  vp9_default_bmode_probs(cm->fc.bmode_prob);
-  vp9_kf_default_bmode_probs(cm->kf_bmode_prob);
+  vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs,
+             sizeof(vp9_kf_default_bmode_probs));
   vp9_init_mv_probs(cm);
+
   // To force update of the sharpness
   cm->last_sharpness_level = -1;
 
   vp9_init_mode_contexts(cm);
 
-  for (i = 0; i < NUM_FRAME_CONTEXTS; i++) {
-    vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
+  if ((cm->frame_type == KEY_FRAME) ||
+      cm->error_resilient_mode || (cm->reset_frame_context == 3)) {
+    // Reset all frame contexts.
+    for (i = 0; i < NUM_FRAME_CONTEXTS; ++i)
+      vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));
+  } else if (cm->reset_frame_context == 2) {
+    // Reset only the frame context specified in the frame header.
+    vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
+               sizeof(cm->fc));
   }
 
   vpx_memset(cm->prev_mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
   vpx_memset(cm->mip, 0,
-             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+             cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
 
   vp9_update_mode_info_border(cm, cm->mip);
   vp9_update_mode_info_in_image(cm, cm->mi);
 
-#if CONFIG_NEW_MVREF
-  // Defaults probabilities for encoding the MV ref id signal
-  vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
-             sizeof(xd->mb_mv_ref_probs));
-#endif
-  cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+  vp9_update_mode_info_border(cm, cm->prev_mip);
+  vp9_update_mode_info_in_image(cm, cm->prev_mi);
+
+  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
 
   cm->frame_context_idx = 0;
 }
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,61 +15,35 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define SUBMVREF_COUNT 5
-#define VP9_NUMMBSPLITS 4
+#define TX_SIZE_CONTEXTS 2
 
-#if CONFIG_COMP_INTERINTRA_PRED
-#define VP9_DEF_INTERINTRA_PROB 248
-#define VP9_UPD_INTERINTRA_PROB 192
-// whether to use a separate uv mode (1) or use the same as the y mode (0)
-#define SEPARATE_INTERINTRA_UV  0
-#endif
+#define VP9_MODE_UPDATE_PROB  252
 
-typedef const int vp9_mbsplit[16];
+// #define MODE_STATS
 
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-
 extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
 
-extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
 
-extern const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES]
-                                                     [VP9_KF_BINTRAMODES]
-                                                     [VP9_KF_BINTRAMODES];
+extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
+                                                [VP9_INTRA_MODES]
+                                                [VP9_INTRA_MODES - 1];
 
-extern const vp9_tree_index vp9_bmode_tree[];
-extern const vp9_tree_index vp9_kf_bmode_tree[];
-
-extern const vp9_tree_index  vp9_ymode_tree[];
-extern const vp9_tree_index  vp9_kf_ymode_tree[];
-extern const vp9_tree_index  vp9_uv_mode_tree[];
-#define vp9_sb_ymode_tree vp9_uv_mode_tree
-#define vp9_sb_kf_ymode_tree vp9_uv_mode_tree
-extern const vp9_tree_index  vp9_i8x8_mode_tree[];
-extern const vp9_tree_index  vp9_mbsplit_tree[];
-extern const vp9_tree_index  vp9_mv_ref_tree[];
+extern const vp9_tree_index vp9_intra_mode_tree[];
 extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
-extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
 
-extern struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES];
-extern struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES];
-extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
-extern struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
-extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
+extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES];
 
 /* Inter mode values do not start at zero */
 
-extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
+extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES];
 
+// probability models for partition information
+extern const vp9_tree_index  vp9_partition_tree[];
+extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
+extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES]
+                                         [NUM_PARTITION_CONTEXTS]
+                                         [PARTITION_TYPES - 1];
+
 void vp9_entropy_mode_init(void);
 
 struct VP9Common;
@@ -87,12 +61,6 @@
                               MB_PREDICTION_MODE m,
                               const int context);
 
-void vp9_default_bmode_probs(vp9_prob dest[VP9_NKF_BINTRAMODES - 1]);
-
-void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_KF_BINTRAMODES]
-                                             [VP9_KF_BINTRAMODES]
-                                             [VP9_KF_BINTRAMODES - 1]);
-
 void vp9_adapt_mode_probs(struct VP9Common *);
 
 #define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */
@@ -107,10 +75,22 @@
 extern const  vp9_tree_index vp9_switchable_interp_tree
                   [2 * (VP9_SWITCHABLE_FILTERS - 1)];
 
-extern struct vp9_token_struct vp9_switchable_interp_encodings
-                  [VP9_SWITCHABLE_FILTERS];
+extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
 
 extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                                  [VP9_SWITCHABLE_FILTERS - 1];
 
+extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS]
+                                                 [TX_SIZE_MAX_SB - 1];
+extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS]
+                                                 [TX_SIZE_MAX_SB - 2];
+extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS]
+                                               [TX_SIZE_MAX_SB - 3];
+
+extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
+                                             unsigned int (*ct_32x32p)[2]);
+extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
+                                             unsigned int (*ct_16x16p)[2]);
+extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p,
+                                           unsigned int (*ct_8x8p)[2]);
 #endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -14,16 +14,11 @@
 
 //#define MV_COUNT_TESTING
 
-#define MV_COUNT_SAT 16
-#define MV_MAX_UPDATE_FACTOR 160
+#define MV_COUNT_SAT 20
+#define MV_MAX_UPDATE_FACTOR 128
 
-#if CONFIG_NEW_MVREF
 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    1000000
-#else
-/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
 #define COMPANDED_MVREF_THRESH    8
-#endif
 
 /* Smooth or bias the mv-counts before prob computation */
 /* #define SMOOTH_MV_COUNTS */
@@ -33,7 +28,7 @@
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
-struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS];
+struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
 const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
   -MV_CLASS_0, 2,
@@ -47,12 +42,12 @@
   -MV_CLASS_7, -MV_CLASS_8,
   -MV_CLASS_9, -MV_CLASS_10,
 };
-struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];
+struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
 const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = {
   -0, -1,
 };
-struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
 const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = {
   -0, 2,
@@ -59,7 +54,7 @@
   -1, 4,
   -2, -3
 };
-struct vp9_token_struct vp9_mv_fp_encodings[4];
+struct vp9_token vp9_mv_fp_encodings[4];
 
 const nmv_context vp9_default_nmv_context = {
   {32, 64, 96},
@@ -87,11 +82,15 @@
   },
 };
 
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv) {
-  if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO;
-  else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ;
-  else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ;
-  else return MV_JOINT_HNZVNZ;
+MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) {
+  if (mv->row == 0 && mv->col == 0)
+    return MV_JOINT_ZERO;
+  else if (mv->row == 0 && mv->col != 0)
+    return MV_JOINT_HNZVZ;
+  else if (mv->row != 0 && mv->col == 0)
+    return MV_JOINT_HZVNZ;
+  else
+    return MV_JOINT_HNZVNZ;
 }
 
 #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0)
@@ -137,7 +136,8 @@
                                     int incr,
                                     int usehp) {
   int s, z, c, o, d, e, f;
-  if (!incr) return;
+  if (!incr)
+    return;
   assert (v != 0);            /* should not be zero */
   s = v < 0;
   mvcomp->sign[s] += incr;
@@ -152,8 +152,8 @@
   if (c == MV_CLASS_0) {
     mvcomp->class0[d] += incr;
   } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
+    int i;
+    int b = c + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < b; ++i)
       mvcomp->bits[i][((d >> i) & 1)] += incr;
   }
@@ -204,25 +204,22 @@
 
 void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
                        int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+  const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
   mvctx->joints[j]++;
   usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
+  if (mv_joint_vertical(j))
     increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
+
+  if (mv_joint_horizontal(j))
     increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
-  }
 }
 
-static void adapt_prob(vp9_prob *dest, vp9_prob prep,
-                       unsigned int ct[2]) {
-  int count = ct[0] + ct[1];
+static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
+  const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT);
   if (count) {
-    vp9_prob newp = get_binary_prob(ct[0], ct[1]);
-    count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    *dest = weighted_prob(prep, newp,
-                          MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
+    const vp9_prob newp = get_binary_prob(ct[0], ct[1]);
+    const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT;
+    *dest = weighted_prob(prep, newp, factor);
   } else {
     *dest = prep;
   }
@@ -253,10 +250,12 @@
                                    branch_ct_joint,
                                    nmv_count->joints, 0);
   for (i = 0; i < 2; ++i) {
-    prob->comps[i].sign = get_binary_prob(nmv_count->comps[i].sign[0],
-                                          nmv_count->comps[i].sign[1]);
-    branch_ct_sign[i][0] = nmv_count->comps[i].sign[0];
-    branch_ct_sign[i][1] = nmv_count->comps[i].sign[1];
+    const uint32_t s0 = nmv_count->comps[i].sign[0];
+    const uint32_t s1 = nmv_count->comps[i].sign[1];
+
+    prob->comps[i].sign = get_binary_prob(s0, s1);
+    branch_ct_sign[i][0] = s0;
+    branch_ct_sign[i][1] = s1;
     vp9_tree_probs_from_distribution(vp9_mv_class_tree,
                                      prob->comps[i].classes,
                                      branch_ct_classes[i],
@@ -266,10 +265,12 @@
                                      branch_ct_class0[i],
                                      nmv_count->comps[i].class0, 0);
     for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      prob->comps[i].bits[j] = get_binary_prob(nmv_count->comps[i].bits[j][0],
-                                               nmv_count->comps[i].bits[j][1]);
-      branch_ct_bits[i][j][0] = nmv_count->comps[i].bits[j][0];
-      branch_ct_bits[i][j][1] = nmv_count->comps[i].bits[j][1];
+      const uint32_t b0 = nmv_count->comps[i].bits[j][0];
+      const uint32_t b1 = nmv_count->comps[i].bits[j][1];
+
+      prob->comps[i].bits[j] = get_binary_prob(b0, b1);
+      branch_ct_bits[i][j][0] = b0;
+      branch_ct_bits[i][j][1] = b1;
     }
   }
   for (i = 0; i < 2; ++i) {
@@ -286,16 +287,18 @@
   }
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      prob->comps[i].class0_hp =
-          get_binary_prob(nmv_count->comps[i].class0_hp[0],
-                          nmv_count->comps[i].class0_hp[1]);
-      branch_ct_class0_hp[i][0] = nmv_count->comps[i].class0_hp[0];
-      branch_ct_class0_hp[i][1] = nmv_count->comps[i].class0_hp[1];
+      const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
+      const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
+      const uint32_t hp0 = nmv_count->comps[i].hp[0];
+      const uint32_t hp1 = nmv_count->comps[i].hp[1];
 
-      prob->comps[i].hp = get_binary_prob(nmv_count->comps[i].hp[0],
-                                          nmv_count->comps[i].hp[1]);
-      branch_ct_hp[i][0] = nmv_count->comps[i].hp[0];
-      branch_ct_hp[i][1] = nmv_count->comps[i].hp[1];
+      prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
+      branch_ct_class0_hp[i][0] = c0_hp0;
+      branch_ct_class0_hp[i][1] = c0_hp1;
+
+      prob->comps[i].hp = get_binary_prob(hp0, hp1);
+      branch_ct_hp[i][0] = hp0;
+      branch_ct_hp[i][1] = hp1;
     }
   }
 }
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -24,14 +24,8 @@
 void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
 int vp9_use_nmv_hp(const MV *ref);
 
-#define VP9_NMV_UPDATE_PROB  255
+#define VP9_NMV_UPDATE_PROB  252
 
-#if CONFIG_NEW_MVREF
-#define VP9_MVREF_UPDATE_PROB 252
-#define VP9_DEFAULT_MV_REF_PROB 192
-#define VP9_MV_REF_UPDATE_COST (14 << 8)
-#endif
-
 //#define MV_GROUP_UPDATE
 
 #define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
@@ -45,8 +39,16 @@
   MV_JOINT_HNZVNZ = 3,           /* Both components nonzero */
 } MV_JOINT_TYPE;
 
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+  return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
+}
+
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+  return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
+}
+
 extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
 /* Symbols for coding magnitude class of nonzero components */
 #define MV_CLASSES     11
@@ -65,7 +67,7 @@
 } MV_CLASS_TYPE;
 
 extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
 #define CLASS0_BITS    1  /* bits at integer precision for class 0 */
 #define CLASS0_SIZE    (1 << CLASS0_BITS)
@@ -76,10 +78,10 @@
 #define MV_VALS        ((MV_MAX << 1) + 1)
 
 extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
-extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE];
+extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
 extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
-extern struct vp9_token_struct vp9_mv_fp_encodings[4];
+extern struct vp9_token vp9_mv_fp_encodings[4];
 
 typedef struct {
   vp9_prob sign;
@@ -97,7 +99,7 @@
   nmv_component comps[2];
 } nmv_context;
 
-MV_JOINT_TYPE vp9_get_mv_joint(MV mv);
+MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv);
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
 int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
 
--- /dev/null
+++ b/vp9/common/vp9_enums.h
@@ -1,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ENUMS_H_
+#define VP9_COMMON_VP9_ENUMS_H_
+
+#include "./vpx_config.h"
+
+#define LOG2_MI_SIZE 3
+
+#define MI_SIZE (1 << LOG2_MI_SIZE)
+#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1)
+
+typedef enum BLOCK_SIZE_TYPE {
+  BLOCK_SIZE_AB4X4,
+  BLOCK_SIZE_SB4X8,
+  BLOCK_SIZE_SB8X4,
+  BLOCK_SIZE_SB8X8,
+  BLOCK_SIZE_SB8X16,
+  BLOCK_SIZE_SB16X8,
+  BLOCK_SIZE_MB16X16,
+  BLOCK_SIZE_SB16X32,
+  BLOCK_SIZE_SB32X16,
+  BLOCK_SIZE_SB32X32,
+  BLOCK_SIZE_SB32X64,
+  BLOCK_SIZE_SB64X32,
+  BLOCK_SIZE_SB64X64,
+  BLOCK_SIZE_TYPES
+} BLOCK_SIZE_TYPE;
+
+typedef enum PARTITION_TYPE {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_SPLIT,
+  PARTITION_TYPES
+} PARTITION_TYPE;
+
+#define PARTITION_PLOFFSET   4  // number of probability models per block size
+#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+
+#endif  // VP9_COMMON_VP9_ENUMS_H_
--- a/vp9/common/vp9_extend.c
+++ b/vp9/common/vp9_extend.c
@@ -60,11 +60,23 @@
   const int eb_y = dst->border + dst->y_height - src->y_height;
   const int er_y = dst->border + dst->y_width - src->y_width;
 
-  const int et_uv = dst->border >> 1;
-  const int el_uv = dst->border >> 1;
-  const int eb_uv = (dst->border >> 1) + dst->uv_height - src->uv_height;
-  const int er_uv = (dst->border >> 1) + dst->uv_width - src->uv_width;
+  const int et_uv = dst->border >> (dst->uv_height != dst->y_height);
+  const int el_uv = dst->border >> (dst->uv_width != dst->y_width);
+  const int eb_uv = et_uv + dst->uv_height - src->uv_height;
+  const int er_uv = el_uv + dst->uv_width - src->uv_width;
 
+#if CONFIG_ALPHA
+  const int et_a = dst->border >> (dst->alpha_height != dst->y_height);
+  const int el_a = dst->border >> (dst->alpha_width != dst->y_width);
+  const int eb_a = et_a + dst->alpha_height - src->alpha_height;
+  const int er_a = el_a + dst->alpha_width - src->alpha_width;
+
+  copy_and_extend_plane(src->alpha_buffer, src->alpha_stride,
+                        dst->alpha_buffer, dst->alpha_stride,
+                        src->alpha_width, src->alpha_height,
+                        et_a, el_a, eb_a, er_a);
+#endif
+
   copy_and_extend_plane(src->y_buffer, src->y_stride,
                         dst->y_buffer, dst->y_stride,
                         src->y_width, src->y_height,
@@ -78,7 +90,7 @@
   copy_and_extend_plane(src->v_buffer, src->uv_stride,
                         dst->v_buffer, dst->uv_stride,
                         src->uv_width, src->uv_height,
-                        et_y, el_y, eb_uv, er_uv);
+                        et_uv, el_uv, eb_uv, er_uv);
 }
 
 void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
@@ -118,30 +130,4 @@
                         dst->v_buffer + dst_uv_offset, dst->uv_stride,
                         srcw_uv, srch_uv,
                         et_uv, el_uv, eb_uv, er_uv);
-}
-
-// note the extension is only for the last row, for intra prediction purpose
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf,
-                       uint8_t *y, uint8_t *u, uint8_t *v) {
-  int i;
-
-  y += buf->y_stride * 14;
-  u += buf->uv_stride * 6;
-  v += buf->uv_stride * 6;
-
-  for (i = 0; i < 4; i++) {
-    y[i] = y[-1];
-    u[i] = u[-1];
-    v[i] = v[-1];
-  }
-
-  y += buf->y_stride;
-  u += buf->uv_stride;
-  v += buf->uv_stride;
-
-  for (i = 0; i < 4; i++) {
-    y[i] = y[-1];
-    u[i] = u[-1];
-    v[i] = v[-1];
-  }
 }
--- a/vp9/common/vp9_extend.h
+++ b/vp9/common/vp9_extend.h
@@ -22,9 +22,4 @@
                                          YV12_BUFFER_CONFIG *dst,
                                          int srcy, int srcx,
                                          int srch, int srcw);
-
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf,
-                       uint8_t *y, uint8_t *u, uint8_t *v);
-
-
 #endif  // VP9_COMMON_VP9_EXTEND_H_
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -34,12 +34,7 @@
   { 0, 0, 0,   8, 120, 0, 0, 0 }
 };
 
-#define FILTER_ALPHA        0
-#define FILTER_ALPHA_SHARP  0
-#define FILTER_ALPHA_SMOOTH 50
-DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8])
-    = {
-#if FILTER_ALPHA == 0
+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
   /* Lagrangian interpolation filter */
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
@@ -57,38 +52,10 @@
   { -1,   3,  -9,  27, 118, -13,   4, -1},
   { 0,   2,  -6,  18, 122, -10,   3, -1},
   { 0,   1,  -3,   8, 126,  -5,   1,  0}
-
-#elif FILTER_ALPHA == 50
-  /* Generated using MATLAB:
-   * alpha = 0.5;
-   * b=intfilt(8,4,alpha);
-   * bi=round(128*b);
-   * ba=flipud(reshape([bi 0], 8, 8));
-   * disp(num2str(ba, '%d,'))
-   */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  { 0,   1,  -5, 126,   8,  -3,   1,  0},
-  { 0,   2, -10, 122,  18,  -6,   2,  0},
-  { -1,   3, -13, 118,  27,  -9,   3,  0},
-  { -1,   4, -16, 112,  37, -11,   3,  0},
-  { -1,   5, -17, 104,  48, -14,   4, -1},
-  { -1,   5, -18,  96,  58, -16,   5, -1},
-  { -1,   5, -19,  88,  68, -17,   5, -1},
-  { -1,   5, -18,  78,  78, -18,   5, -1},
-  { -1,   5, -17,  68,  88, -19,   5, -1},
-  { -1,   5, -16,  58,  96, -18,   5, -1},
-  { -1,   4, -14,  48, 104, -17,   5, -1},
-  { 0,   3, -11,  37, 112, -16,   4, -1},
-  { 0,   3,  -9,  27, 118, -13,   3, -1},
-  { 0,   2,  -6,  18, 122, -10,   2,  0},
-  { 0,   1,  -3,   8, 126,  -5,   1,  0}
-
-#endif  /* FILTER_ALPHA */
 };
 
 DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])
     = {
-#if FILTER_ALPHA_SHARP == 0
   /* dct based filter */
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
@@ -106,88 +73,25 @@
   {-2,   5, -10,  27, 121, -17,   7, -3},
   {-1,   3,  -6,  17, 125, -13,   5, -2},
   {0,   1,  -3,   8, 127,  -7,   3, -1}
-
-#elif FILTER_ALPHA_SHARP == 80
-  /* alpha = 0.80 */
-  { 0,   0,   0, 128,   0,   0,   0,  0},
-  {-1,   2,  -6, 127,   9,  -4,   2, -1},
-  {-2,   5, -12, 124,  18,  -7,   4, -2},
-  {-2,   7, -16, 119,  28, -11,   5, -2},
-  {-3,   8, -19, 114,  38, -14,   7, -3},
-  {-3,   9, -22, 107,  49, -17,   8, -3},
-  {-4,  10, -23,  99,  60, -20,  10, -4},
-  {-4,  11, -23,  90,  70, -22,  10, -4},
-  {-4,  11, -23,  80,  80, -23,  11, -4},
-  {-4,  10, -22,  70,  90, -23,  11, -4},
-  {-4,  10, -20,  60,  99, -23,  10, -4},
-  {-3,   8, -17,  49, 107, -22,   9, -3},
-  {-3,   7, -14,  38, 114, -19,   8, -3},
-  {-2,   5, -11,  28, 119, -16,   7, -2},
-  {-2,   4,  -7,  18, 124, -12,   5, -2},
-  {-1,   2,  -4,   9, 127,  -6,   2, -1}
-#endif  /* FILTER_ALPHA_SHARP */
 };
 
 DECLARE_ALIGNED(256, const int16_t,
                 vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
-  /* 8-tap lowpass filter */
-  /* Hamming window */
-  /* freqmultiplier = 0.625 */
-#if FILTER_ALPHA_SMOOTH == 625
-  {-1, -7, 32, 80, 32, -7, -1,  0},
-  {-1, -8, 28, 80, 37, -7, -2,  1},
-  { 0, -8, 24, 79, 41, -7, -2,  1},
-  { 0, -8, 20, 78, 45, -5, -3,  1},
-  { 0, -8, 16, 76, 50, -4, -3,  1},
-  { 0, -7, 13, 74, 54, -3, -4,  1},
-  { 1, -7,  9, 71, 58, -1, -4,  1},
-  { 1, -6,  6, 68, 62,  1, -5,  1},
-  { 1, -6,  4, 65, 65,  4, -6,  1},
-  { 1, -5,  1, 62, 68,  6, -6,  1},
-  { 1, -4, -1, 58, 71,  9, -7,  1},
-  { 1, -4, -3, 54, 74, 13, -7,  0},
-  { 1, -3, -4, 50, 76, 16, -8,  0},
-  { 1, -3, -5, 45, 78, 20, -8,  0},
-  { 1, -2, -7, 41, 79, 24, -8,  0},
-  { 1, -2, -7, 37, 80, 28, -8, -1}
-
-#elif FILTER_ALPHA_SMOOTH == 50
   /* freqmultiplier = 0.5 */
-  {-3,  0, 35, 64, 35,  0, -3, 0},
-  {-3, -1, 32, 64, 38,  1, -3, 0},
-  {-2, -2, 29, 63, 41,  2, -3, 0},
-  {-2, -2, 26, 63, 43,  4, -4, 0},
-  {-2, -3, 24, 62, 46,  5, -4, 0},
-  {-2, -3, 21, 60, 49,  7, -4, 0},
-  {-1, -4, 18, 59, 51,  9, -4, 0},
-  {-1, -4, 16, 57, 53, 12, -4, -1},
-  {-1, -4, 14, 55, 55, 14, -4, -1},
-  {-1, -4, 12, 53, 57, 16, -4, -1},
-  {0, -4,  9, 51, 59, 18, -4, -1},
-  {0, -4,  7, 49, 60, 21, -3, -2},
-  {0, -4,  5, 46, 62, 24, -3, -2},
-  {0, -4,  4, 43, 63, 26, -2, -2},
-  {0, -3,  2, 41, 63, 29, -2, -2},
-  {0, -3,  1, 38, 64, 32, -1, -3}
-#endif
-};
-
-DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8])
-    = {
-  {0, 0,   0, 128,   0,   0, 0,  0},
-  {0, 1,  -5, 125,   8,  -2, 1,  0},
-  {0, 1,  -8, 122,  17,  -5, 1,  0},
-  {0, 2, -11, 116,  27,  -8, 2,  0},
-  {0, 3, -14, 110,  37, -10, 2,  0},
-  {0, 3, -15, 103,  47, -12, 2,  0},
-  {0, 3, -16,  95,  57, -14, 3,  0},
-  {0, 3, -16,  86,  67, -15, 3,  0},
-  {0, 3, -16,  77,  77, -16, 3,  0},
-  {0, 3, -15,  67,  86, -16, 3,  0},
-  {0, 3, -14,  57,  95, -16, 3,  0},
-  {0, 2, -12,  47, 103, -15, 3,  0},
-  {0, 2, -10,  37, 110, -14, 3,  0},
-  {0, 2,  -8,  27, 116, -11, 2,  0},
-  {0, 1,  -5,  17, 122,  -8, 1,  0},
-  {0, 1,  -2,   8, 125,  -5, 1,  0}
+  { 0,  0,  0, 128,  0,  0,  0,  0},
+  {-3, -1, 32,  64, 38,  1, -3,  0},
+  {-2, -2, 29,  63, 41,  2, -3,  0},
+  {-2, -2, 26,  63, 43,  4, -4,  0},
+  {-2, -3, 24,  62, 46,  5, -4,  0},
+  {-2, -3, 21,  60, 49,  7, -4,  0},
+  {-1, -4, 18,  59, 51,  9, -4,  0},
+  {-1, -4, 16,  57, 53, 12, -4, -1},
+  {-1, -4, 14,  55, 55, 14, -4, -1},
+  {-1, -4, 12,  53, 57, 16, -4, -1},
+  { 0, -4,  9,  51, 59, 18, -4, -1},
+  { 0, -4,  7,  49, 60, 21, -3, -2},
+  { 0, -4,  5,  46, 62, 24, -3, -2},
+  { 0, -4,  4,  43, 63, 26, -2, -2},
+  { 0, -3,  2,  41, 63, 29, -2, -2},
+  { 0, -3,  1,  38, 64, 32, -1, -3}
 };
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -8,22 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include <limits.h>
 
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_sadmxn.h"
 #include "vp9/common/vp9_subpelvar.h"
 
-const uint8_t vp9_mbsplit_offset[4][16] = {
-  { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
-static void lower_mv_precision(int_mv *mv, int usehp)
-{
+static void lower_mv_precision(int_mv *mv, int usehp) {
   if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
     if (mv->as_mv.row & 1)
       mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
@@ -32,288 +24,73 @@
   }
 }
 
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[4], const int context) {
-  p[0] = pc->fc.vp9_mode_contexts[context][0];
-  p[1] = pc->fc.vp9_mode_contexts[context][1];
-  p[2] = pc->fc.vp9_mode_contexts[context][2];
-  p[3] = pc->fc.vp9_mode_contexts[context][3];
+vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, vp9_prob *p, int context) {
+  p[0] = pc->fc.inter_mode_probs[context][0];
+  p[1] = pc->fc.inter_mode_probs[context][1];
+  p[2] = pc->fc.inter_mode_probs[context][2];
   return p;
 }
 
-#define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(const uint8_t *src_ptr,
-                           int  src_stride,
-                           const uint8_t *ref_ptr,
-                           int  ref_stride) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
-}
-unsigned int vp9_sad16x3_c(const uint8_t *src_ptr,
-                           int  src_stride,
-                           const uint8_t *ref_ptr,
-                           int  ref_stride) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
-}
-
-
-unsigned int vp9_variance2x16_c(const uint8_t *src_ptr,
-                                int  source_stride,
-                                const uint8_t *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  int sum;
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, sse, &sum);
-  return (*sse - (((unsigned int)sum * sum) >> 5));
-}
-
-unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
-                                int  source_stride,
-                                const uint8_t *ref_ptr,
-                                int  recon_stride,
-                                unsigned int *sse) {
-  int sum;
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, sse, &sum);
-  return (*sse - (((unsigned int)sum * sum) >> 5));
-}
-
-unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
-                                          int  source_stride,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const uint8_t *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse) {
-  uint16_t FData3[16 * 3];  // Temp data buffer used in filtering
-  uint8_t temp2[2 * 16];
-  const int16_t *HFilter, *VFilter;
-
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    source_stride, 1, 3, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
-
-  return vp9_variance16x2_c(temp2, 16, ref_ptr, ref_stride, sse);
-}
-
-unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
-                                          int  source_stride,
-                                          int  xoffset,
-                                          int  yoffset,
-                                          const uint8_t *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse) {
-  uint16_t FData3[2 * 17];  // Temp data buffer used in filtering
-  uint8_t temp2[2 * 16];
-  const int16_t *HFilter, *VFilter;
-
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    source_stride, 1, 17, 2, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
-
-  return vp9_variance2x16_c(temp2, 2, ref_ptr, ref_stride, sse);
-}
-
-#if CONFIG_USESELECTREFMV
-/* check a list of motion vectors by sad score using a number rows of pixels
- * above and a number cols of pixels in the left to select the one with best
- * score to use as ref motion vector
- */
-
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           uint8_t *ref_y_buffer,
-                           int ref_y_stride,
                            int_mv *mvlist,
                            int_mv *nearest,
                            int_mv *near) {
-  int i, j;
-  uint8_t *above_src;
-  uint8_t *above_ref;
-#if !CONFIG_ABOVESPREFMV
-  uint8_t *left_src;
-  uint8_t *left_ref;
-#endif
-  unsigned int score;
-  unsigned int sse;
-  unsigned int ref_scores[MAX_MV_REF_CANDIDATES] = {0};
-  int_mv sorted_mvs[MAX_MV_REF_CANDIDATES];
-  int zero_seen = FALSE;
+  int i;
+  // Make sure all the candidates are properly clamped etc
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+    lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
+    clamp_mv2(&mvlist[i], xd);
+  }
+  *nearest = mvlist[0];
+  *near = mvlist[1];
+}
 
-  if (ref_y_buffer) {
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                   int_mv *dst_nearest,
+                                   int_mv *dst_near,
+                                   int block_idx, int ref_idx) {
+  int_mv dst_list[MAX_MV_REF_CANDIDATES];
+  int_mv mv_list[MAX_MV_REF_CANDIDATES];
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-    // Default all to 0,0 if nothing else available
-    nearest->as_int = near->as_int = 0;
-    vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+  assert(ref_idx == 0 || ref_idx == 1);
+  assert(MAX_MV_REF_CANDIDATES == 2);  // makes code here slightly easier
 
-    above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
-    above_ref = ref_y_buffer - ref_y_stride * 2;
-#if CONFIG_ABOVESPREFMV
-    above_src -= 4;
-    above_ref -= 4;
-#else
-    left_src  = xd->dst.y_buffer - 2;
-    left_ref  = ref_y_buffer - 2;
-#endif
+  vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
+                       xd->prev_mode_info_context,
+                       mbmi->ref_frame[ref_idx],
+                       mv_list, cm->ref_frame_sign_bias, block_idx);
 
-    // Limit search to the predicted best few candidates
-    for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-      int_mv this_mv;
-      int offset = 0;
-      int row_offset, col_offset;
+  dst_list[1].as_int = 0;
+  if (block_idx == 0) {
+    memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
+  } else if (block_idx == 1 || block_idx == 2) {
+    int dst = 0, n;
+    union b_mode_info *bmi = mi->bmi;
 
-      this_mv.as_int = mvlist[i].as_int;
-
-      // If we see a 0,0 vector for a second time we have reached the end of
-      // the list of valid candidate vectors.
-      if (!this_mv.as_int && zero_seen)
-        break;
-
-      zero_seen = zero_seen || !this_mv.as_int;
-
-#if !CONFIG_ABOVESPREFMV
-      clamp_mv(&this_mv,
-               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
-               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#else
-      clamp_mv(&this_mv,
-               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,
-               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
-               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-#endif
-
-      row_offset = this_mv.as_mv.row >> 3;
-      col_offset = this_mv.as_mv.col >> 3;
-      offset = ref_y_stride * row_offset + col_offset;
-      score = 0;
-#if !CONFIG_ABOVESPREFMV
-      if (xd->up_available) {
-#else
-      if (xd->up_available && xd->left_available) {
-#endif
-        vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
-                                   SP(this_mv.as_mv.col),
-                                   SP(this_mv.as_mv.row),
-                                   above_src, xd->dst.y_stride, &sse);
-        score += sse;
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-          vp9_sub_pixel_variance16x2(above_ref + offset + 16,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 16, xd->dst.y_stride, &sse);
-          score += sse;
-        }
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-          vp9_sub_pixel_variance16x2(above_ref + offset + 32,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 32, xd->dst.y_stride, &sse);
-          score += sse;
-          vp9_sub_pixel_variance16x2(above_ref + offset + 48,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 48, xd->dst.y_stride, &sse);
-          score += sse;
-        }
-      }
-#if !CONFIG_ABOVESPREFMV
-      if (xd->left_available) {
-        vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     left_src, xd->dst.y_stride, &sse);
-        score += sse;
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       left_src + xd->dst.y_stride * 16,
-                                       xd->dst.y_stride, &sse);
-          score += sse;
-        }
-        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
-                                     ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       left_src + xd->dst.y_stride * 32,
-                                       xd->dst.y_stride, &sse);
-          score += sse;
-          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       left_src + xd->dst.y_stride * 48,
-                                       xd->dst.y_stride, &sse);
-          score += sse;
-        }
-      }
-#endif
-      // Add the entry to our list and then resort the list on score.
-      ref_scores[i] = score;
-      sorted_mvs[i].as_int = this_mv.as_int;
-      j = i;
-      while (j > 0) {
-        if (ref_scores[j] < ref_scores[j-1]) {
-          ref_scores[j] = ref_scores[j-1];
-          sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
-          ref_scores[j-1] = score;
-          sorted_mvs[j-1].as_int = this_mv.as_int;
-          j--;
-        } else {
-          break;
-        }
-      }
-    }
+    dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
+    for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
+                n < MAX_MV_REF_CANDIDATES; n++)
+      if (mv_list[n].as_int != dst_list[0].as_int)
+        dst_list[dst++].as_int = mv_list[n].as_int;
   } else {
-    vpx_memcpy(sorted_mvs, mvlist, sizeof(sorted_mvs));
-  }
+    int dst = 0, n;
+    union b_mode_info *bmi = mi->bmi;
 
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
-    clamp_mv2(&sorted_mvs[i], xd);
+    assert(block_idx == 3);
+    dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int;
+    if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int)
+      dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int;
+    if (dst < MAX_MV_REF_CANDIDATES &&
+        dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int)
+      dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
+    for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
+                n < MAX_MV_REF_CANDIDATES; n++)
+      if (mv_list[n].as_int != dst_list[0].as_int)
+        dst_list[dst++].as_int = mv_list[n].as_int;
   }
 
-  // Nearest may be a 0,0 or non zero vector and now matches the chosen
-  // "best reference". This has advantages when it is used as part of a
-  // compound predictor as it means a non zero vector can be paired using
-  // this mode with a 0 vector. The Near vector is still forced to be a
-  // non zero candidate if one is avaialble.
-  nearest->as_int = sorted_mvs[0].as_int;
-  if ( sorted_mvs[1].as_int ) {
-    near->as_int = sorted_mvs[1].as_int;
-  } else {
-    near->as_int = sorted_mvs[2].as_int;
-  }
-
-  // Copy back the re-ordered mv list
-  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
+  dst_nearest->as_int = dst_list[0].as_int;
+  dst_near->as_int = dst_list[1].as_int;
 }
-#else
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           uint8_t *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *nearest,
-                           int_mv *near) {
-  int i;
-  // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
-    clamp_mv2(&mvlist[i], xd);
-  }
-  *nearest = mvlist[0];
-  *near = mvlist[1];
-}
-#endif
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -17,16 +17,13 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
+#define LEFT_TOP_MARGIN     ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3)
 
-/* check a list of motion vectors by sad score using a number rows of pixels
- * above and a number cols of pixels in the left to select the one with best
- * score to use as ref motion vector
- */
+// check a list of motion vectors by sad score using a number rows of pixels
+// above and a number cols of pixels in the left to select the one with best
+// score to use as ref motion vector
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           uint8_t *ref_y_buffer,
-                           int ref_y_stride,
                            int_mv *mvlist,
                            int_mv *nearest,
                            int_mv *near);
@@ -43,35 +40,30 @@
   mvp->as_mv = xmv;
 }
 
-
+// TODO(jingning): this mv clamping function should be block size dependent.
 static void clamp_mv(int_mv *mv,
                      int mb_to_left_edge,
                      int mb_to_right_edge,
                      int mb_to_top_edge,
                      int mb_to_bottom_edge) {
-  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
-                  mb_to_left_edge : mv->as_mv.col;
-  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
-                  mb_to_right_edge : mv->as_mv.col;
-  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
-                  mb_to_top_edge : mv->as_mv.row;
-  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
-                  mb_to_bottom_edge : mv->as_mv.row;
+  mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge);
+  mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge);
 }
 
-static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
+  int_mv tmp_mv;
+  tmp_mv.as_int = mv->as_int;
   clamp_mv(mv,
            xd->mb_to_left_edge - LEFT_TOP_MARGIN,
            xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
            xd->mb_to_top_edge - LEFT_TOP_MARGIN,
            xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+  return tmp_mv.as_int != mv->as_int;
 }
 
-static unsigned int check_mv_bounds(int_mv *mv,
-                                    int mb_to_left_edge,
-                                    int mb_to_right_edge,
-                                    int mb_to_top_edge,
-                                    int mb_to_bottom_edge) {
+static int check_mv_bounds(int_mv *mv,
+                           int mb_to_left_edge, int mb_to_right_edge,
+                           int mb_to_top_edge, int mb_to_bottom_edge) {
   return mv->as_mv.col < mb_to_left_edge ||
          mv->as_mv.col > mb_to_right_edge ||
          mv->as_mv.row < mb_to_top_edge ||
@@ -79,116 +71,50 @@
 }
 
 vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[VP9_MVREFS - 1],
+                           vp9_prob p[VP9_INTER_MODES - 1],
                            const int context);
 
-extern const uint8_t vp9_mbsplit_offset[4][16];
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
+                                   MACROBLOCKD *xd,
+                                   int_mv *dst_nearest,
+                                   int_mv *dst_near,
+                                   int block_idx, int ref_idx);
 
-static int left_block_mv(const MACROBLOCKD *xd,
-                         const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    if (!xd->left_available)
-      return 0;
-
-    // On L edge, get from MB to left of us
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-
-    b += 4;
-  }
-
-  return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
-}
-
-static int left_block_second_mv(const MACROBLOCKD *xd,
-                                const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    if (!xd->left_available)
-      return 0;
-
+static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
+  // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
+  // understand this condition. This will go away soon.
+  if (b == 0 || b == 2) {
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame > 0 ?
-          cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 4;
-  }
-
-  return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 1)->as_mv[1].as_int :
-      (cur_mb->bmi + b - 1)->as_mv[0].as_int;
-}
-
-static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return (cur_mb->bmi + b - 4)->as_mv[0].as_int;
-}
-
-static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> 2)) {
-    /* On top edge, get from MB above us */
-    cur_mb -= mi_stride;
-
-    if (cur_mb->mbmi.mode != SPLITMV)
-      return cur_mb->mbmi.second_ref_frame > 0 ?
-          cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 16;
-  }
-
-  return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - 4)->as_mv[1].as_int :
-      (cur_mb->bmi + b - 4)->as_mv[0].as_int;
-}
-
-static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-  if (!(b & 3)) {
-    /* On L edge, get from MB to left of us */
-    --cur_mb;
-
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 3 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 3 + b)->as_mode.first);
+    if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+      return DC_PRED;
+    } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      return ((cur_mb->bmi + 1 + b)->as_mode.first);
     } else {
-      return B_DC_PRED;
+      return cur_mb->mbmi.mode;
     }
   }
+  assert(b == 1 || b == 3);
   return (cur_mb->bmi + b - 1)->as_mode.first;
 }
 
-static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
                                           int b, int mi_stride) {
-  if (!(b >> 2)) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
-    if (cur_mb->mbmi.mode < I8X8_PRED) {
-      return pred_mode_conv(cur_mb->mbmi.mode);
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 12 + b)->as_mode.first);
-    } else if (cur_mb->mbmi.mode == B_PRED) {
-      return ((cur_mb->bmi + 12 + b)->as_mode.first);
+    if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+      return DC_PRED;
+    } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      return ((cur_mb->bmi + 2 + b)->as_mode.first);
     } else {
-      return B_DC_PRED;
+      return cur_mb->mbmi.mode;
     }
   }
 
-  return (cur_mb->bmi + b - 4)->as_mode.first;
+  return (cur_mb->bmi + b - 2)->as_mode.first;
 }
 
 #endif  // VP9_COMMON_VP9_FINDNEARMV_H_
--- a/vp9/common/vp9_header.h
+++ /dev/null
@@ -1,40 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_HEADER_H_
-#define VP9_COMMON_VP9_HEADER_H_
-
-/* 24 bits total */
-typedef struct {
-  unsigned int type: 1;
-  unsigned int version: 3;
-  unsigned int show_frame: 1;
-
-  /* Allow 2^20 bytes = 8 megabits for first partition */
-
-  unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
-  unsigned int frame_number;
-  unsigned int update_gold: 1;
-  unsigned int uses_gold: 1;
-  unsigned int update_last: 1;
-  unsigned int uses_last: 1;
-#endif
-
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-#endif  // VP9_COMMON_VP9_HEADER_H_
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,84 +18,84 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+   0.5 shifts per pixel. */
   int i;
-  int a1, b1, c1, d1;
+  int16_t output[16];
+  int a1, b1, c1, d1, e1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
-    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
-    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
-    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
-    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;
-
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[1] = (c1 + d1) >> 1;
-    op[2] = (a1 - b1) >> 1;
-    op[3] = (d1 - c1) >> 1;
-
+    a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+    c1 = ip[1] >> WHT_UPSCALE_FACTOR;
+    d1 = ip[2] >> WHT_UPSCALE_FACTOR;
+    b1 = ip[3] >> WHT_UPSCALE_FACTOR;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
     ip += 4;
-    op += half_pitch;
+    op += 4;
   }
 
   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
 
-
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
-
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
+  int a1, e1;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
 
-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
-  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
+  a1 = ip[0] >> WHT_UPSCALE_FACTOR;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = a1;
+  op[1] = op[2] = op[3] = e1;
 
   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1);
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1);
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
-                                 uint8_t *dst_ptr,
-                                 int pitch, int stride) {
-  int r, c;
-  int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
-}
-
 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
@@ -116,10 +116,9 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -138,22 +137,24 @@
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }
 
@@ -219,14 +220,13 @@
   output[7] = step1[0] - step1[7];
 }
 
-void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
-  // Rows
+  // First transform rows
   for (i = 0; i < 8; ++i) {
     idct8_1d(input, outptr);
     input += 8;
@@ -233,13 +233,14 @@
     outptr += 8;
   }
 
-  // Columns
+  // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j)
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -285,8 +286,8 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
@@ -312,10 +313,10 @@
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
-
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -400,8 +401,8 @@
   { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -421,14 +422,14 @@
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
-  }
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
+                                int dest_stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[8], temp_out[8];
 
@@ -447,7 +448,8 @@
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -621,10 +623,9 @@
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -641,7 +642,8 @@
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -823,8 +825,8 @@
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                              int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -844,38 +846,38 @@
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
-    int16_t out[16 * 16];
-    int16_t *outptr = out;
-    const int half_pitch = pitch >> 1;
-    int i, j;
-    int16_t temp_in[16], temp_out[16];
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
 
-    /* First transform rows. Since all non-zero dct coefficients are in
-     * upper-left 4x4 area, we only need to calculate first 4 rows here.
-     */
-    vpx_memset(out, 0, sizeof(out));
-    for (i = 0; i < 4; ++i) {
-      idct16_1d(input, outptr);
-      input += 16;
-      outptr += 16;
-    }
+  /* First transform rows. Since all non-zero dct coefficients are in
+   * upper-left 4x4 area, we only need to calculate first 4 rows here.
+   */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
 
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      idct16_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-    }
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
+  }
 }
 
-
 void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
@@ -1249,10 +1251,9 @@
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1269,7 +1270,8 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -1279,10 +1281,10 @@
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1302,6 +1304,7 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -17,6 +17,7 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_common.h"
 
+
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
 #define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
--- a/vp9/common/vp9_implicit_segmentation.c
+++ b/vp9/common/vp9_implicit_segmentation.c
@@ -140,11 +140,11 @@
           break;
         case SEGMENT_MV:
           n = mi[mb_index].mbmi.mv[0].as_int;
-          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
+          if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME)
             n = -9999999;
           break;
         case SEGMENT_REFFRAME:
-          n = mi[mb_index].mbmi.ref_frame;
+          n = mi[mb_index].mbmi.ref_frame[0];
           break;
         case SEGMENT_SKIPPED:
           n = mi[mb_index].mbmi.mb_skip_coeff;
@@ -191,11 +191,12 @@
 
   // give new labels to regions
   for (i = 1; i < label; i++)
-    if (labels[i].next->count > min_mbs_in_region  &&  labels[labels[i].next->label].label == 0) {
+    if (labels[i].next->count > min_mbs_in_region &&
+        labels[labels[i].next->label].label == 0) {
       segment_info *cs = &segments[label_count];
       cs->label = label_count;
       labels[labels[i].next->label].label = label_count++;
-      labels[labels[i].next->label].seg_value  = labels[i].next->seg_value;
+      labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
       cs->seg_value = labels[labels[i].next->label].seg_value;
       cs->min_x = oci->mb_cols;
       cs->min_y = oci->mb_rows;
@@ -204,24 +205,21 @@
       cs->sum_x = 0;
       cs->sum_y = 0;
       cs->pixels = 0;
-
     }
+
   lp = labeling;
 
   // this is just to gather stats...
   for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
     for (j = 0; j < oci->mb_cols; j++) {
-      segment_info *cs;
-      int oldlab = labels[lp[j]].next->label;
-      int lab = labels[oldlab].label;
-      lp[j] = lab;
+      const int old_lab = labels[lp[j]].next->label;
+      const int lab = labels[old_lab].label;
+      segment_info *cs = &segments[lab];
 
-      cs = &segments[lab];
-
-      cs->min_x = (j < cs->min_x ? j : cs->min_x);
-      cs->max_x = (j > cs->max_x ? j : cs->max_x);
-      cs->min_y = (i < cs->min_y ? i : cs->min_y);
-      cs->max_y = (i > cs->max_y ? i : cs->max_y);
+      cs->min_x = MIN(cs->min_x, j);
+      cs->max_x = MAX(cs->max_x, j);
+      cs->min_y = MIN(cs->min_y, i);
+      cs->max_y = MAX(cs->max_y, i);
       cs->sum_x += j;
       cs->sum_y += i;
       cs->pixels++;
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,311 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
-}
-
-void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    TX_TYPE tx_type = get_tx_type_4x4(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
-    } else {
-      vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,
-                                  xd->block[i].diff, 32);
-    }
-  }
-}
-
-void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 16; i < 24; i++) {
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,
-                                xd->block[i].diff, 16);
-  }
-}
-
-void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_4x4(xd);
-  vp9_inverse_transform_mbuv_4x4(xd);
-}
-
-void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff,
-                                 int pitch) {
-  vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 9; i += 8) {
-    TX_TYPE tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);
-    } else {
-      vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
-                                  &blockd[i].diff[0], 32);
-    }
-  }
-  for (i = 2; i < 11; i += 8) {
-    TX_TYPE tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,
-                           16, tx_type);
-    } else {
-      vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
-                                  &blockd[i].diff[0], 32);
-    }
-  }
-}
-
-void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd) {
-  int i;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 16; i < 24; i += 4) {
-    vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
-                                &blockd[i].diff[0], 16);
-  }
-}
-
-void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_8x8(xd);
-  vp9_inverse_transform_mbuv_8x8(xd);
-}
-
-void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
-                                   int16_t *output_coeff, int pitch) {
-  vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
-  BLOCKD *bd = &xd->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-  if (tx_type != DCT_DCT) {
-    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);
-  } else {
-    vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
-                                  &xd->block[0].diff[0], 32);
-  }
-}
-
-void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_16x16(xd);
-  vp9_inverse_transform_mbuv_8x8(xd);
-}
-
-void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd) {
-  vp9_short_idct32x32(xd->dqcoeff, xd->diff, 64);
-}
-
-void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
-                                    xd->diff + x_idx * 16 + y_idx * 32 * 16,
-                                    64);
-    } else {
-      vp9_short_iht16x16(xd->dqcoeff + n * 256,
-                         xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
-                                  xd->diff + x_idx * 8 + y_idx * 32 * 8, 64);
-    } else {
-      vp9_short_iht8x8(xd->dqcoeff + n * 64,
-                       xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
-                                  xd->diff + x_idx * 4 + y_idx * 4 * 32, 64);
-    } else {
-      vp9_short_iht4x4(xd->dqcoeff + n * 16,
-                       xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd) {
-  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1024,
-                                xd->diff + 1024, 32);
-  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1280,
-                                xd->diff + 1280, 32);
-}
-
-void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1024 + n * 64,
-                                xd->diff + 1024 + x_idx * 8 + y_idx * 16 * 8,
-                                32);
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1280 + n * 64,
-                                xd->diff + 1280 + x_idx * 8 + y_idx * 16 * 8,
-                                32);
-  }
-}
-
-void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + n],
-                                xd->dqcoeff + 1024 + n * 16,
-                                xd->diff + 1024 + x_idx * 4 + y_idx * 16 * 4,
-                                32);
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + 16 + n],
-                                xd->dqcoeff + 1280 + n * 16,
-                                xd->diff + 1280 + x_idx * 4 + y_idx * 16 * 4,
-                                32);
-  }
-}
-
-void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_short_idct32x32(xd->dqcoeff + n * 1024,
-                        xd->diff + x_idx * 32 + y_idx * 32 * 64, 128);
-  }
-}
-
-void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,
-                                    xd->diff + x_idx * 16 + y_idx * 64 * 16,
-                                    128);
-    } else {
-      vp9_short_iht16x16(xd->dqcoeff + n * 256,
-                         xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,
-                                  xd->diff + x_idx * 8 + y_idx * 64 * 8, 128);
-    } else {
-      vp9_short_iht8x8(xd->dqcoeff + n * 64,
-                       xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 256; n++) {
-    const int x_idx = n & 15, y_idx = n >> 4;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,
-                                  xd->diff + x_idx * 4 + y_idx * 4 * 64, 128);
-    } else {
-      vp9_short_iht4x4(xd->dqcoeff + n * 16,
-                       xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd) {
-  vp9_short_idct32x32(xd->dqcoeff + 4096,
-                      xd->diff + 4096, 64);
-  vp9_short_idct32x32(xd->dqcoeff + 4096 + 1024,
-                      xd->diff + 4096 + 1024, 64);
-}
-
-void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1, off = x_idx * 16 + y_idx * 32 * 16;
-
-    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + n * 256,
-                                  xd->diff + 4096 + off, 64);
-    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + 1024 + n * 256,
-                                  xd->diff + 4096 + 1024 + off, 64);
-  }
-}
-
-void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2, off = x_idx * 8 + y_idx * 32 * 8;
-
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + n * 64,
-                                xd->diff + 4096 + off, 64);
-    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + 1024 + n * 64,
-                                xd->diff + 4096 + 1024 + off, 64);
-  }
-}
-
-void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3, off = x_idx * 4 + y_idx * 32 * 4;
-
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + n],
-                                xd->dqcoeff + 4096 + n * 16,
-                                xd->diff + 4096 + off, 64);
-    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + 64 + n],
-                                xd->dqcoeff + 4096 + 1024 + n * 16,
-                                xd->diff + 4096 + 1024 + off, 64);
-  }
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,47 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
-
-void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
-                                        int16_t *output_coeff, int pitch);
-
-void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
-                                          int16_t *output_coeff, int pitch);
-
-void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd);
-void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd);
-void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd);
-
-void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd);
-void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd);
-
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -11,46 +11,26 @@
 #include "vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_seg_common.h"
 
 static void lf_init_lut(loop_filter_info_n *lfi) {
-  int filt_lvl;
-
-  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
-    if (filt_lvl >= 40) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
-    } else if (filt_lvl >= 20) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
-    } else if (filt_lvl >= 15) {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
-    } else {
-      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
-      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
-    }
-  }
-
-  lfi->mode_lf_lut[DC_PRED] = 1;
-  lfi->mode_lf_lut[D45_PRED] = 1;
-  lfi->mode_lf_lut[D135_PRED] = 1;
-  lfi->mode_lf_lut[D117_PRED] = 1;
-  lfi->mode_lf_lut[D153_PRED] = 1;
-  lfi->mode_lf_lut[D27_PRED] = 1;
-  lfi->mode_lf_lut[D63_PRED] = 1;
-  lfi->mode_lf_lut[V_PRED] = 1;
-  lfi->mode_lf_lut[H_PRED] = 1;
-  lfi->mode_lf_lut[TM_PRED] = 1;
-  lfi->mode_lf_lut[B_PRED]  = 0;
-  lfi->mode_lf_lut[I8X8_PRED] = 0;
-  lfi->mode_lf_lut[ZEROMV]  = 1;
-  lfi->mode_lf_lut[NEARESTMV] = 2;
-  lfi->mode_lf_lut[NEARMV] = 2;
-  lfi->mode_lf_lut[NEWMV] = 2;
-  lfi->mode_lf_lut[SPLITMV] = 3;
+  lfi->mode_lf_lut[DC_PRED] = 0;
+  lfi->mode_lf_lut[D45_PRED] = 0;
+  lfi->mode_lf_lut[D135_PRED] = 0;
+  lfi->mode_lf_lut[D117_PRED] = 0;
+  lfi->mode_lf_lut[D153_PRED] = 0;
+  lfi->mode_lf_lut[D27_PRED] = 0;
+  lfi->mode_lf_lut[D63_PRED] = 0;
+  lfi->mode_lf_lut[V_PRED] = 0;
+  lfi->mode_lf_lut[H_PRED] = 0;
+  lfi->mode_lf_lut[TM_PRED] = 0;
+  lfi->mode_lf_lut[ZEROMV]  = 0;
+  lfi->mode_lf_lut[NEARESTMV] = 1;
+  lfi->mode_lf_lut[NEARMV] = 1;
+  lfi->mode_lf_lut[NEWMV] = 1;
 }
 
 void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
@@ -86,25 +66,28 @@
   loop_filter_info_n *lfi = &cm->lf_info;
   int i;
 
-  /* init limits for given sharpness*/
+  // init limits for given sharpness
   vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
   cm->last_sharpness_level = cm->sharpness_level;
 
-  /* init LUT for lvl  and hev thr picking */
+  // init LUT for lvl and hev thr picking
   lf_init_lut(lfi);
 
-  /* init hev threshold const vectors */
-  for (i = 0; i < 4; i++) {
+  // init hev threshold const vectors
+  for (i = 0; i < 4; i++)
     vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
-  }
 }
 
 void vp9_loop_filter_frame_init(VP9_COMMON *cm,
                                 MACROBLOCKD *xd,
                                 int default_filt_lvl) {
-  int seg,  /* segment number */
-      ref,  /* index in ref_lf_deltas */
-      mode; /* index in mode_lf_deltas */
+  int seg,    // segment number
+      ref,    // index in ref_lf_deltas
+      mode;   // index in mode_lf_deltas
+  // n_shift is the multiplier for lf_deltas:
+  // the multiplier is 1 when filter_lvl is between 0 and 31,
+  // and 2 when filter_lvl is between 32 and 63
+  int n_shift = default_filt_lvl >> 5;
 
   loop_filter_info_n *lfi = &cm->lf_info;
 
@@ -147,360 +130,278 @@
     ref = INTRA_FRAME;
 
     /* Apply delta for reference frame */
-    lvl_ref += xd->ref_lf_deltas[ref];
+    lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
 
-    /* Apply delta for Intra modes */
-    mode = 0; /* B_PRED */
-    /* Only the split mode BPRED has a further special case */
-    lvl_mode = clamp(lvl_ref +  xd->mode_lf_deltas[mode], 0, 63);
+    mode = 0; /* all the rest of Intra modes */
+    lvl_mode = lvl_ref;
+    lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
 
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
-    mode = 1; /* all the rest of Intra modes */
-    lvl_mode = clamp(lvl_ref, 0, 63);
-    lfi->lvl[seg][ref][mode] = lvl_mode;
-
     /* LAST, GOLDEN, ALT */
     for (ref = 1; ref < MAX_REF_FRAMES; ref++) {
       int lvl_ref = lvl_seg;
 
       /* Apply delta for reference frame */
-      lvl_ref += xd->ref_lf_deltas[ref];
+      lvl_ref += xd->ref_lf_deltas[ref] << n_shift;
 
       /* Apply delta for Inter modes */
-      for (mode = 1; mode < 4; mode++) {
-        lvl_mode = clamp(lvl_ref + xd->mode_lf_deltas[mode], 0, 63);
-        lfi->lvl[seg][ref][mode] = lvl_mode;
+      for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) {
+        lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift);
+        lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63);
       }
     }
   }
 }
 
-// Determine if we should skip inner-MB loop filtering within a MB
-// The current condition is that the loop filtering is skipped only
-// the MB uses a prediction size of 16x16 and either 16x16 transform
-// is used or there is no residue at all.
-static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
-  const MB_PREDICTION_MODE mode = mbmi->mode;
-  const int skip_coef = mbmi->mb_skip_coeff;
-  const int tx_size = mbmi->txfm_size;
-  return mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV &&
-         (tx_size >= TX_16X16 || skip_coef);
-}
+static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi,
+                      struct loop_filter_info *lfi) {
+  const loop_filter_info_n *lfi_n = &cm->lf_info;
+  int mode = mbmi->mode;
+  int mode_index = lfi_n->mode_lf_lut[mode];
+  int seg = mbmi->segment_id;
+  int ref_frame = mbmi->ref_frame[0];
+  int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
 
-// Determine if we should skip MB loop filtering on a MB edge within
-// a superblock, the current condition is that MB loop filtering is
-// skipped only when both MBs do not use inner MB loop filtering, and
-// same motion vector with same reference frame
-static int sb_mb_lf_skip(const MODE_INFO *const mip0,
-                         const MODE_INFO *const mip1) {
-  const MB_MODE_INFO *mbmi0 = &mip0->mbmi;
-  const MB_MODE_INFO *mbmi1 = &mip0->mbmi;
-  return mb_lf_skip(mbmi0) && mb_lf_skip(mbmi1) &&
-         (mbmi0->ref_frame == mbmi1->ref_frame) &&
-         (mbmi0->mv[mbmi0->ref_frame].as_int ==
-          mbmi1->mv[mbmi1->ref_frame].as_int) &&
-         mbmi0->ref_frame != INTRA_FRAME;
+  if (filter_level) {
+    const int hev_index = filter_level >> 4;
+    lfi->mblim = lfi_n->mblim[filter_level];
+    lfi->blim = lfi_n->blim[filter_level];
+    lfi->lim = lfi_n->lim[filter_level];
+    lfi->hev_thr = lfi_n->hev_thr[hev_index];
+    return 1;
+  }
+  return 0;
 }
 
-void vp9_loop_filter_frame(VP9_COMMON *cm,
-                           MACROBLOCKD *xd,
-                           int frame_filter_level,
-                           int y_only,
-                           int dering) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-  const FRAME_TYPE frame_type = cm->frame_type;
-  int mb_row, mb_col;
-  uint8_t *y_ptr, *u_ptr, *v_ptr;
+static void filter_selectively_vert(uint8_t *s, int pitch,
+                                    unsigned int mask_16x16,
+                                    unsigned int mask_8x8,
+                                    unsigned int mask_4x4,
+                                    unsigned int mask_4x4_int,
+                                    const struct loop_filter_info *lfi) {
+  unsigned int mask;
 
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-  const int mis = cm->mode_info_stride;
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) {
+    if (mask & 1) {
+      if (mask_16x16 & 1) {
+        vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                   lfi->hev_thr, 1);
+        assert(!(mask_8x8 & 1));
+        assert(!(mask_4x4 & 1));
+        assert(!(mask_4x4_int & 1));
+      } else if (mask_8x8 & 1) {
+        vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
+                                        lfi->hev_thr, 1);
+        assert(!(mask_16x16 & 1));
+        assert(!(mask_4x4 & 1));
+      } else if (mask_4x4 & 1) {
+        vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1);
+        assert(!(mask_16x16 & 1));
+        assert(!(mask_8x8 & 1));
+      } else {
+        assert(0);
+      }
 
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-  if (y_only) {
-    u_ptr = 0;
-    v_ptr = 0;
-  } else {
-    u_ptr = post->u_buffer;
-    v_ptr = post->v_buffer;
+      if (mask_4x4_int & 1)
+        vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
+                                      lfi->hev_thr, 1);
+    }
+    s += 8;
+    lfi++;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
+}
 
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      const MB_PREDICTION_MODE mode = mode_info_context->mbmi.mode;
-      const int mode_index = lfi_n->mode_lf_lut[mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      const int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-      if (filter_level) {
-        const int skip_lf = mb_lf_skip(&mode_info_context->mbmi);
-        const int tx_size = mode_info_context->mbmi.txfm_size;
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+static void filter_selectively_horiz(uint8_t *s, int pitch,
+                                     unsigned int mask_16x16,
+                                     unsigned int mask_8x8,
+                                     unsigned int mask_4x4,
+                                     unsigned int mask_4x4_int,
+                                     int only_4x4_1,
+                                     const struct loop_filter_info *lfi) {
+  unsigned int mask;
 
-          if (mb_col > 0 &&
-              !((mb_col & 1) && mode_info_context->mbmi.sb_type &&
-                (sb_mb_lf_skip(mode_info_context - 1, mode_info_context) ||
-                 tx_size >= TX_32X32))
-              ) {
-            if (tx_size >= TX_16X16)
-              vp9_lpf_mbv_w(y_ptr, u_ptr, v_ptr, post->y_stride,
-                            post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                  post->uv_stride, &lfi);
-          }
-          if (!skip_lf) {
-            if (tx_size >= TX_8X8) {
-              if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
-                vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                      post->uv_stride, &lfi);
-              else
-                vp9_loop_filter_bv8x8(y_ptr, NULL, NULL, post->y_stride,
-                                      post->uv_stride, &lfi);
-            } else {
-              vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-            }
-          }
-          /* don't apply across umv border */
-          if (mb_row > 0 &&
-              !((mb_row & 1) && mode_info_context->mbmi.sb_type &&
-                (sb_mb_lf_skip(mode_info_context - mis, mode_info_context) ||
-                tx_size >= TX_32X32))
-              ) {
-            if (tx_size >= TX_16X16)
-              vp9_lpf_mbh_w(y_ptr, u_ptr, v_ptr, post->y_stride,
-                            post->uv_stride, &lfi);
-            else
-              vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                  post->uv_stride, &lfi);
-          }
-          if (!skip_lf) {
-            if (tx_size >= TX_8X8) {
-              if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
-                vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                      post->uv_stride, &lfi);
-              else
-                vp9_loop_filter_bh8x8(y_ptr, NULL, NULL, post->y_stride,
-                                      post->uv_stride, &lfi);
-            } else {
-              vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                 post->uv_stride, &lfi);
-            }
-          }
-#if CONFIG_LOOP_DERING
-          if (dering) {
-            if (mb_row && mb_row < cm->mb_rows - 1 &&
-                mb_col && mb_col < cm->mb_cols - 1) {
-              vp9_post_proc_down_and_across(y_ptr, y_ptr,
-                                            post->y_stride, post->y_stride,
-                                            16, 16, dering);
-              if (!y_only) {
-                vp9_post_proc_down_and_across(u_ptr, u_ptr,
-                                              post->uv_stride, post->uv_stride,
-                                              8, 8, dering);
-                vp9_post_proc_down_and_across(v_ptr, v_ptr,
-                                              post->uv_stride, post->uv_stride,
-                                              8, 8, dering);
-              }
-            } else {
-              // Adjust the filter so that no out-of-frame data is used.
-              uint8_t *dr_y = y_ptr, *dr_u = u_ptr, *dr_v = v_ptr;
-              int w_adjust = 0;
-              int h_adjust = 0;
-
-              if (mb_col == 0) {
-                dr_y += 2;
-                dr_u += 2;
-                dr_v += 2;
-                w_adjust += 2;
-              }
-              if (mb_col == cm->mb_cols - 1)
-                w_adjust += 2;
-              if (mb_row == 0) {
-                dr_y += 2 * post->y_stride;
-                dr_u += 2 * post->uv_stride;
-                dr_v += 2 * post->uv_stride;
-                h_adjust += 2;
-              }
-              if (mb_row == cm->mb_rows - 1)
-                h_adjust += 2;
-              vp9_post_proc_down_and_across_c(dr_y, dr_y,
-                                              post->y_stride, post->y_stride,
-                                              16 - w_adjust, 16 - h_adjust,
-                                              dering);
-              if (!y_only) {
-                vp9_post_proc_down_and_across_c(dr_u, dr_u,
-                                                post->uv_stride,
-                                                post->uv_stride,
-                                                8 - w_adjust, 8 - h_adjust,
-                                                dering);
-                vp9_post_proc_down_and_across_c(dr_v, dr_v,
-                                                post->uv_stride,
-                                                post->uv_stride,
-                                                8 - w_adjust, 8 - h_adjust,
-                                                dering);
-              }
-            }
-          }
-#endif
+  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) {
+    if (mask & 1) {
+      if (!only_4x4_1) {
+        if (mask_16x16 & 1) {
+          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1);
+          assert(!(mask_8x8 & 1));
+          assert(!(mask_4x4 & 1));
+          assert(!(mask_4x4_int & 1));
+        } else if (mask_8x8 & 1) {
+          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                            lfi->hev_thr, 1);
+          assert(!(mask_16x16 & 1));
+          assert(!(mask_4x4 & 1));
+        } else if (mask_4x4 & 1) {
+          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                          lfi->hev_thr, 1);
+          assert(!(mask_16x16 & 1));
+          assert(!(mask_8x8 & 1));
         } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0 &&
-              !(skip_lf && mb_lf_skip(&mode_info_context[-1].mbmi)) &&
-              !((mb_col & 1) && mode_info_context->mbmi.sb_type))
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0 &&
-              !(skip_lf && mb_lf_skip(&mode_info_context[-mis].mbmi)) &&
-              !((mb_row & 1) && mode_info_context->mbmi.sb_type))
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
+          assert(0);
         }
       }
-      y_ptr += 16;
-      if (!y_only) {
-        u_ptr += 8;
-        v_ptr += 8;
-      }
-      mode_info_context++;     /* step to next MB */
+
+      if (mask_4x4_int & 1)
+        vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                        lfi->lim, lfi->hev_thr, 1);
     }
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    if (!y_only) {
-      u_ptr += post->uv_stride *  8 - post->uv_width;
-      v_ptr += post->uv_stride *  8 - post->uv_width;
-    }
-    mode_info_context++;         /* Skip border mb */
+    s += 8;
+    lfi++;
+    mask_16x16 >>= 1;
+    mask_8x8 >>= 1;
+    mask_4x4 >>= 1;
+    mask_4x4_int >>= 1;
   }
 }
 
+static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
+                               int plane, int mi_row, int mi_col) {
+  const int ss_x = xd->plane[plane].subsampling_x;
+  const int ss_y = xd->plane[plane].subsampling_y;
+  const int row_step = 1 << xd->plane[plane].subsampling_y;
+  const int col_step = 1 << xd->plane[plane].subsampling_x;
+  struct buf_2d * const dst = &xd->plane[plane].dst;
+  uint8_t* const dst0 = dst->buf;
+  MODE_INFO* const mi0 = xd->mode_info_context;
+  unsigned int mask_16x16[64 / MI_SIZE] = {0};
+  unsigned int mask_8x8[64 / MI_SIZE] = {0};
+  unsigned int mask_4x4[64 / MI_SIZE] = {0};
+  unsigned int mask_4x4_int[64 / MI_SIZE] = {0};
+  struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE];
+  int r, c;
 
-void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                   int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+  for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    unsigned int mask_16x16_c = 0;
+    unsigned int mask_8x8_c = 0;
+    unsigned int mask_4x4_c = 0;
+    unsigned int border_mask;
 
-  uint8_t *y_ptr;
-  int mb_row;
-  int mb_col;
-  int mb_cols = post->y_width  >> 4;
+    // Determine the vertical edges that need filtering
+    for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *const mi = xd->mode_info_context;
+      const int skip_this = mi[c].mbmi.mb_skip_coeff
+                            && mi[c].mbmi.ref_frame != INTRA_FRAME;
+      // left edge of current unit is block/partition edge -> no skip
+      const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ?
+          !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
+      const int skip_this_c = skip_this && !block_edge_left;
+      // top edge of current unit is block/partition edge -> no skip
+      const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ?
+          !(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
+      const int skip_this_r = skip_this && !block_edge_above;
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(&mi[c].mbmi)
+                                    : mi[c].mbmi.txfm_size;
+      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
+      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
-  int linestocopy, i;
+      // Filter level can vary per MI
+      if (!build_lfi(cm, &mi[c].mbmi,
+                     lfi[r] + (c >> xd->plane[plane].subsampling_x)))
+        continue;
 
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
+      // Build masks based on the transform size of each block
+      if (tx_size == TX_32X32) {
+        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else if (tx_size == TX_16X16) {
+        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= 1 << (c >> ss_x);
+          else
+            mask_8x8[r] |= 1 << (c >> ss_x);
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c) {
+          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
+            mask_8x8_c |= 1 << (c >> ss_x);
+          else
+            mask_4x4_c |= 1 << (c >> ss_x);
+        }
 
-  int filter_level;
-  int alt_flt_enabled = xd->segmentation_enabled;
-  FRAME_TYPE frame_type = cm->frame_type;
+        if (!skip_this_r) {
+          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
+            mask_8x8[r] |= 1 << (c >> ss_x);
+          else
+            mask_4x4[r] |= 1 << (c >> ss_x);
+        }
 
-  const MODE_INFO *mode_info_context;
-
-  int lvl_seg[MAX_MB_SEGMENTS];
-
-  mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
-
-  /* 3 is a magic number. 4 is probably magic too */
-  linestocopy = (post->y_height >> (4 + 3));
-
-  if (linestocopy < 1)
-    linestocopy = 1;
-
-  linestocopy <<= 4;
-
-  /* Note the baseline filter values for each segment */
-  /* See vp9_loop_filter_frame_init. Rather than call that for each change
-   * to default_filt_lvl, copy the relevant calculation here.
-   */
-  if (alt_flt_enabled) {
-    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-      if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {
-        // Abs value
-        lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-      } else {
-        // Delta Value
-        lvl_seg[i] = default_filt_lvl + vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);
-        lvl_seg[i] = clamp(lvl_seg[i], 0, 63);
+        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+          mask_4x4_int[r] |= 1 << (c >> ss_x);
       }
     }
+
+    // Disable filtering on the leftmost column
+    border_mask = ~(mi_col == 0);
+    filter_selectively_vert(dst->buf, dst->stride,
+                            mask_16x16_c & border_mask,
+                            mask_8x8_c & border_mask,
+                            mask_4x4_c & border_mask,
+                            mask_4x4_int[r], lfi[r]);
+    dst->buf += 8 * dst->stride;
+    xd->mode_info_context += cm->mode_info_stride * row_step;
   }
 
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
+  // Now do horizontal pass
+  dst->buf = dst0;
+  xd->mode_info_context = mi0;
+  for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
 
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) {
-    for (mb_col = 0; mb_col < mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
+    filter_selectively_horiz(dst->buf, dst->stride,
+                             mask_16x16[r],
+                             mask_8x8[r],
+                             mask_4x4[r],
+                             mask_4x4_int_r, mi_row + r == 0, lfi[r]);
+    dst->buf += 8 * dst->stride;
+    xd->mode_info_context += cm->mode_info_stride * row_step;
+  }
+}
 
-      if (alt_flt_enabled)
-        filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
-      else
-        filter_level = default_filt_lvl;
+void vp9_loop_filter_frame(VP9_COMMON *cm,
+                           MACROBLOCKD *xd,
+                           int frame_filter_level,
+                           int y_only) {
+  int mi_row, mi_col;
 
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
+  // Initialize the loop filter for this frame.
+  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
 
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) {
+    MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
 
-          if (!skip_lf)
-            vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) {
+      int plane;
 
-          vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf)
-            vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-        } else {
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv (y_ptr, post->y_stride,
-                                        lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                     lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
+      setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col);
+      for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) {
+        xd->mode_info_context = mi + mi_col;
+        filter_block_plane(cm, xd, plane, mi_row, mi_col);
       }
-
-      y_ptr += 16;
-      mode_info_context += 1;      /* step to next MB */
     }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context += 1;          /* Skip border mb */
   }
 }
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -16,12 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 
 #define MAX_LOOP_FILTER 63
-
-typedef enum {
-  NORMAL_LOOPFILTER = 0,
-  SIMPLE_LOOPFILTER = 1
-} LOOPFILTERTYPE;
-
 #define SIMD_WIDTH 16
 
 /* Need to align this structure so when it is declared and
@@ -36,8 +30,7 @@
                   lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
   DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
                   hev_thr[4][SIMD_WIDTH]);
-  unsigned char lvl[4][4][4];
-  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+  unsigned char lvl[MAX_MB_SEGMENTS][4][4];
   unsigned char mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;
 
@@ -56,9 +49,6 @@
   void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
            int ystride, int uv_stride, struct loop_filter_info *lfi)
 
-#define prototype_simple_loopfilter(sym) \
-  void sym(uint8_t *y, int ystride, const unsigned char *blimit)
-
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/vp9_loopfilter_x86.h"
 #endif
@@ -83,8 +73,7 @@
 void vp9_loop_filter_frame(struct VP9Common *cm,
                            struct macroblockd *mbd,
                            int filter_level,
-                           int y_only,
-                           int dering);
+                           int y_only);
 
 void vp9_loop_filter_partial_frame(struct VP9Common *cm,
                                    struct macroblockd *mbd,
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -8,19 +8,16 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdlib.h>
 #include "vpx_config.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 static INLINE int8_t signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ? 127 : t);
-  return (int8_t) t;
+  return (int8_t)clamp(t, -128, 127);
 }
 
-
-/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+// should we apply any filter at all: 11111111 yes, 00000000 no
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                  uint8_t p3, uint8_t p2,
                                  uint8_t p1, uint8_t p0,
@@ -34,11 +31,10 @@
   mask |= (abs(q2 - q1) > limit) * -1;
   mask |= (abs(q3 - q2) > limit) * -1;
   mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = ~mask;
-  return mask;
+  return ~mask;
 }
 
-/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
 static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
   int8_t hev = 0;
@@ -70,73 +66,59 @@
 
   *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
   *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-  filter = filter1;
 
   // outer tap adjustments
-  filter += 1;
-  filter >>= 1;
-  filter &= ~hev;
+  filter = ((filter1 + 1) >> 1) & ~hev;
 
   *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s,
-                                       int p, /* pitch */
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
+void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
                                        int count) {
-  int hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int i = 0;
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[0 * p], s[1 * p], s[2 * p], s[3 * p]);
-
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
     ++s;
-  } while (++i < count * 8);
+  }
 }
 
-void vp9_loop_filter_vertical_edge_c(uint8_t *s,
-                                     int p,
-                                     const unsigned char *blimit,
-                                     const unsigned char *limit,
-                                     const unsigned char *thresh,
+void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
+                                     const uint8_t *limit,
+                                     const uint8_t *thresh,
                                      int count) {
-  int  hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int i = 0;
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2, s - 1, s, s + 1);
-
-    s += p;
-  } while (++i < count * 8);
+    s += pitch;
+  }
 }
-static INLINE signed char flatmask4(uint8_t thresh,
-                                    uint8_t p3, uint8_t p2,
-                                    uint8_t p1, uint8_t p0,
-                                    uint8_t q0, uint8_t q1,
-                                    uint8_t q2, uint8_t q3) {
+
+static INLINE int8_t flatmask4(uint8_t thresh,
+                               uint8_t p3, uint8_t p2,
+                               uint8_t p1, uint8_t p0,
+                               uint8_t q0, uint8_t q1,
+                               uint8_t q2, uint8_t q3) {
   int8_t flat = 0;
   flat |= (abs(p1 - p0) > thresh) * -1;
   flat |= (abs(q1 - q0) > thresh) * -1;
@@ -144,8 +126,7 @@
   flat |= (abs(q0 - q2) > thresh) * -1;
   flat |= (abs(p3 - p0) > thresh) * -1;
   flat |= (abs(q3 - q0) > thresh) * -1;
-  flat = ~flat;
-  return flat;
+  return ~flat;
 }
 static INLINE signed char flatmask5(uint8_t thresh,
                                     uint8_t p4, uint8_t p3, uint8_t p2,
@@ -167,289 +148,64 @@
                             uint8_t *oq2, uint8_t *oq3) {
   // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
   if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
+    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
 
-    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
-    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
-    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;
+    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
+    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
+    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
+    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
+    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
+    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) *op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) *op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    filter(mask, hev, op1,  op0, oq0, oq1);
   }
 }
 
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s,
-                                         int p,
-                                         const unsigned char *blimit,
-                                         const unsigned char *limit,
-                                         const unsigned char *thresh,
+void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh,
                                          int count) {
-  int8_t hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int8_t flat = 0;
-  int i = 0;
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    flat = flatmask4(1, s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                        s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     mbfilter(mask, hev, flat,
              s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
              s,         s + 1 * p, s + 2 * p, s + 3 * p);
-
     ++s;
-  } while (++i < count * 8);
-
+  }
 }
 
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s,
-                                       int p,
-                                       const unsigned char *blimit,
-                                       const unsigned char *limit,
-                                       const unsigned char *thresh,
+void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh,
                                        int count) {
-  int8_t hev = 0; /* high edge variance */
-  int8_t mask = 0;
-  int8_t flat = 0;
-  int i = 0;
+  int i;
 
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
-
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask4(1,
-                    s[-4], s[-3], s[-2], s[-1],
-                    s[ 0], s[ 1], s[ 2], s[ 3]);
-    mbfilter(mask, hev, flat,
-             s - 4, s - 3, s - 2, s - 1,
-             s,     s + 1, s + 2, s + 3);
-    s += p;
-  } while (++i < count * 8);
-
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
+                              s,     s + 1, s + 2, s + 3);
+    s += pitch;
+  }
 }
 
-/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static INLINE int8_t simple_filter_mask(uint8_t blimit,
-                                        uint8_t p1, uint8_t p0,
-                                        uint8_t q0, uint8_t q1) {
-  return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
-}
-
-static INLINE void simple_filter(int8_t mask,
-                                 uint8_t *op1, uint8_t *op0,
-                                 uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-  const int8_t p1 = (int8_t) *op1 ^ 0x80;
-  const int8_t p0 = (int8_t) *op0 ^ 0x80;
-  const int8_t q0 = (int8_t) *oq0 ^ 0x80;
-  const int8_t q1 = (int8_t) *oq1 ^ 0x80;
-
-  int8_t filter = signed_char_clamp(p1 - q1);
-  filter = signed_char_clamp(filter + 3 * (q0 - p0));
-  filter &= mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  *oq0  = signed_char_clamp(q0 - filter1) ^ 0x80;
-
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-  *op0 = signed_char_clamp(p0 + filter2) ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s,
-                                              int p,
-                                              const unsigned char *blimit) {
-  int8_t mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0],
-                              s[-2 * p], s[-1 * p],
-                              s[0 * p], s[1 * p]);
-    simple_filter(mask,
-                  s - 2 * p, s - 1 * p,
-                  s, s + 1 * p);
-    ++s;
-  } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s,
-                                            int p,
-                                            const unsigned char *blimit) {
-  int8_t mask = 0;
-  int i = 0;
-
-  do {
-    mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
-  } while (++i < 16);
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                           uint8_t *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_c(uint8_t*y_ptr, uint8_t *u_ptr,
-                          uint8_t *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride,
-                                  lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                           uint8_t *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                          uint8_t *v_ptr, int y_stride, int uv_stride,
-                          struct loop_filter_info *lfi) {
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bh8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                             uint8_t *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c(
-    y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bhs_c(uint8_t *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride,
-                                           y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride,
-                                           y_stride, blimit);
-}
-
-void vp9_loop_filter_bv8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
-                             uint8_t *v_ptr, int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c(
-    y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
-}
-
-void vp9_loop_filter_bvs_c(uint8_t *y_ptr, int y_stride,
-                           const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
-}
-
 static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
                                  uint8_t flat, uint8_t flat2,
                                  uint8_t *op7, uint8_t *op6, uint8_t *op5,
@@ -460,130 +216,65 @@
                                  uint8_t *oq7) {
   // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
   if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7;
-    const uint8_t p6 = *op6;
-    const uint8_t p5 = *op5;
-    const uint8_t p4 = *op4;
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
-    const uint8_t q4 = *oq4;
-    const uint8_t q5 = *oq5;
-    const uint8_t q6 = *oq6;
-    const uint8_t q7 = *oq7;
+    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
 
-    *op6 = (p7 * 7 + p6 * 2 +
-            p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-    *op5 = (p7 * 6 + p6 + p5 * 2 +
-            p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
-    *op4 = (p7 * 5 + p6 + p5 + p4 * 2 +
-            p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
-    *op3 = (p7 * 4 + p6 + p5 + p4 + p3 * 2 +
-            p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
-    *op2 = (p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 +
-            p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
-    *op1 = (p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
-            p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
-    *op0 = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-            q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
-    *oq0 = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
-            q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
-    *oq1 = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
-            q2 + q3 + q4 + q5 + q6 + q7 * 2 + 8) >> 4;
-    *oq2 = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
-            q3 + q4 + q5 + q6 + q7 * 3 + 8) >> 4;
-    *oq3 = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
-            q4 + q5 + q6 + q7 * 4 + 8) >> 4;
-    *oq4 = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
-            q5 + q6 + q7 * 5 + 8) >> 4;
-    *oq5 = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
-            q6 + q7 * 6 + 8) >> 4;
-    *oq6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
-            q7 * 7 + 8) >> 4;
-  } else if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
 
-    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
-    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
-    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
-    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;
-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;
+    *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0, 4);
+    *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1, 4);
+    *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2, 4);
+    *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3, 4);
+    *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4, 4);
+    *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5, 4);
+    *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+    *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+    *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4);
+    *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4);
+    *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
+    *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
+    *oq5 = ROUND_POWER_OF_TWO(p1 + p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
+    *oq6 = ROUND_POWER_OF_TWO(p0 +
+                              q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) * op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) * op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) * oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) * oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
   }
 }
 
-void vp9_mb_lpf_horizontal_edge_w
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  signed char flat2 = 0;
-  int i = 0;
+void vp9_mb_lpf_horizontal_edge_w(uint8_t *s, int p,
+                                 const uint8_t *blimit,
+                                 const uint8_t *limit,
+                                 const uint8_t *thresh,
+                                 int count) {
+  int i;
 
-  /* loop filter designed to work using chars so that we can make maximum use
-   * of 8 bit simd instructions.
-   */
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+  // loop filter designed to work using chars so that we can make maximum use
+  // of 8 bit simd instructions.
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flatmask5(1,
+                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
 
-    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    flat = flatmask4(1,
-                     s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                     s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
-    flat2 = flatmask5(1,
-                      s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
-                      s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
-
     wide_mbfilter(mask, hev, flat, flat2,
                   s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
                   s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
@@ -591,71 +282,29 @@
                   s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
 
     ++s;
-  } while (++i < count * 8);
+  }
 }
-void vp9_mb_lpf_vertical_edge_w
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
-  signed char flat2 = 0;
-  int i = 0;
 
-  do {
-    mask = filter_mask(limit[0], blimit[0],
-                       s[-4], s[-3], s[-2], s[-1],
-                       s[0], s[1], s[2], s[3]);
+void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p,
+                                const uint8_t *blimit,
+                                const uint8_t *limit,
+                                const uint8_t *thresh,
+                                int count) {
+  int i;
 
-    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask4(1,
-                     s[-4], s[-3], s[-2], s[-1],
-                     s[ 0], s[ 1], s[ 2], s[ 3]);
-    flat2 = flatmask5(1,
-                     s[-8], s[-7], s[-6], s[-5], s[-1],
-                     s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                   q0, s[4], s[5], s[6], s[7]);
 
     wide_mbfilter(mask, hev, flat, flat2,
-                  s - 8, s - 7, s - 6, s - 5,
-                  s - 4, s - 3, s - 2, s - 1,
-                  s,     s + 1, s + 2, s + 3,
-                  s + 4, s + 5, s + 6, s + 7);
+                  s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                  s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
     s += p;
-  } while (++i < count * 8);
+  }
 }
-
-void vp9_lpf_mbv_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                   unsigned char *v_ptr, int y_stride, int uv_stride,
-                   struct loop_filter_info *lfi) {
-  vp9_mb_lpf_vertical_edge_w(y_ptr, y_stride,
-                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-void vp9_lpf_mbh_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
-                           struct loop_filter_info *lfi) {
-  vp9_mb_lpf_horizontal_edge_w(y_ptr, y_stride,
-                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
-                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
-}
-
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -11,105 +11,18 @@
 
 #include "vp9/common/vp9_blockd.h"
 
-typedef enum {
-  PRED = 0,
-  DEST = 1
-} BLOCKSET;
+void vp9_setup_block_dptrs(MACROBLOCKD *mb,
+                           int subsampling_x, int subsampling_y) {
+  int i;
 
-static void setup_block(BLOCKD *b,
-                        int mv_stride,
-                        uint8_t **base,
-                        uint8_t **base2,
-                        int stride,
-                        int offset,
-                        BLOCKSET bs) {
-  if (bs == DEST) {
-    b->dst_stride = stride;
-    b->dst = offset;
-    b->base_dst = base;
-  } else {
-    b->pre_stride = stride;
-    b->pre = offset;
-    b->base_pre = base;
-    b->base_second_pre = base2;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+    mb->plane[i].subsampling_x = i ? subsampling_x : 0;
+    mb->plane[i].subsampling_y = i ? subsampling_y : 0;
   }
-}
-
-static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
-  int block;
-
-  uint8_t **y, **u, **v;
-  uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
-  BLOCKD *blockd = xd->block;
-  int stride;
-
-  if (bs == DEST) {
-    y = &xd->dst.y_buffer;
-    u = &xd->dst.u_buffer;
-    v = &xd->dst.v_buffer;
-  } else {
-    y = &xd->pre.y_buffer;
-    u = &xd->pre.u_buffer;
-    v = &xd->pre.v_buffer;
-
-    y2 = &xd->second_pre.y_buffer;
-    u2 = &xd->second_pre.u_buffer;
-    v2 = &xd->second_pre.v_buffer;
-  }
-
-  stride = xd->dst.y_stride;
-  for (block = 0; block < 16; block++) { /* y blocks */
-    setup_block(&blockd[block], stride, y, y2, stride,
-                (block >> 2) * 4 * stride + (block & 3) * 4, bs);
-  }
-
-  stride = xd->dst.uv_stride;
-  for (block = 16; block < 20; block++) { /* U and V blocks */
-    setup_block(&blockd[block], stride, u, u2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-
-    setup_block(&blockd[block + 4], stride, v, v2, stride,
-      ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs);
-  }
-}
-
-void vp9_setup_block_dptrs(MACROBLOCKD *xd) {
-  int r, c;
-  BLOCKD *blockd = xd->block;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4];
-      blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4;
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4];
-      blockd[16 + r * 2 + c].predictor =
-        xd->predictor + 256 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++) {
-      blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4];
-      blockd[20 + r * 2 + c].predictor =
-        xd->predictor + 320 + r * 4 * 8 + c * 4;
-
-    }
-  }
-
-  for (r = 0; r < 24; r++) {
-    blockd[r].qcoeff  = xd->qcoeff  + r * 16;
-    blockd[r].dqcoeff = xd->dqcoeff + r * 16;
-  }
-}
-
-void vp9_build_block_doffsets(MACROBLOCKD *xd) {
-  /* handle the destination pitch features */
-  setup_macroblock(xd, DEST);
-  setup_macroblock(xd, PRED);
+#if CONFIG_ALPHA
+  // TODO(jkoleszar): Using the Y w/h for now
+  mb->plane[3].subsampling_x = 0;
+  mb->plane[3].subsampling_y = 0;
+#endif
 }
--- a/vp9/common/vp9_modecont.c
+++ b/vp9/common/vp9_modecont.c
@@ -11,12 +11,13 @@
 
 #include "vp9/common/vp9_entropy.h"
 
-const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = {
-  {1,       223,   1,    237},  // 0,0 best: Only candidate
-  {87,      166,   26,   219},  // 0,0 best: non zero candidates
-  {89,      67,    18,   125},  // 0,0 best: non zero candidates, split
-  {16,      141,   69,   226},  // strong nz candidate(s), no split
-  {35,      122,   14,   227},  // weak nz candidate(s), no split
-  {14,      122,   22,   164},  // strong nz candidate(s), split
-  {16,      70,    9,    183},  // weak nz candidate(s), split
+const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
+                                           [VP9_INTER_MODES - 1] = {
+  {2,       173,   34},  // 0 = both zero mv
+  {7,       145,   85},  // 1 = one zero mv + one a predicted mv
+  {7,       166,   63},  // 2 = two predicted mvs
+  {7,       94,    66},  // 3 = one predicted/zero and one new mv
+  {8,       64,    46},  // 4 = two new mvs
+  {17,      81,    31},  // 5 = one intra neighbour + x
+  {25,      29,    30},  // 6 = two intra neighbours
 };
--- a/vp9/common/vp9_modecont.h
+++ b/vp9/common/vp9_modecont.h
@@ -11,6 +11,9 @@
 #ifndef VP9_COMMON_VP9_MODECONT_H_
 #define VP9_COMMON_VP9_MODECONT_H_
 
-extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4];
+#include "vp9/common/vp9_entropy.h"
+
+extern const int vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS]
+                                             [VP9_INTER_MODES - 1];
 
 #endif  // VP9_COMMON_VP9_MODECONT_H_
--- a/vp9/common/vp9_modecontext.c
+++ b/vp9/common/vp9_modecontext.c
@@ -11,137 +11,118 @@
 
 #include "vp9/common/vp9_entropymode.h"
 
-const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES]
-                                              [VP9_KF_BINTRAMODES]
-                                              [VP9_KF_BINTRAMODES] = {
-  {
-    /*Above Mode :  0*/
-    { 43438,   2195,    470,    316,    615,    171,    217,    412,    124,    160, }, /* left_mode 0 */
-    {  5722,   2751,    296,    291,     81,     68,     80,    101,    100,    170, }, /* left_mode 1 */
-    {  1629,    201,    307,     25,     47,     16,     34,     72,     19,     28, }, /* left_mode 2 */
-    {   332,    266,     36,    500,     20,     65,     23,     14,    154,    106, }, /* left_mode 3 */
-    {   450,     97,     10,     24,    117,     10,      2,     12,      8,     71, }, /* left_mode 4 */
-    {   384,     49,     29,     44,     12,    162,     51,      5,     87,     42, }, /* left_mode 5 */
-    {   495,     53,    157,     27,     14,     57,    180,     17,     17,     34, }, /* left_mode 6 */
-    {   695,     64,     62,      9,     27,      5,      3,    147,     10,     26, }, /* left_mode 7 */
-    {   230,     54,     20,    124,     16,    125,     29,     12,    283,     37, }, /* left_mode 8 */
-    {   260,     87,     21,    120,     32,     16,     33,     16,     33,    203, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  1*/
-    {  3934,   2573,    355,    137,    128,     87,    133,    117,     37,     27, }, /* left_mode 0 */
-    {  1036,   1929,    278,    135,     27,     37,     48,     55,     41,     91, }, /* left_mode 1 */
-    {   223,    256,    253,     15,     13,      9,     28,     64,      3,      3, }, /* left_mode 2 */
-    {   120,    129,     17,    316,     15,     11,      9,      4,     53,     74, }, /* left_mode 3 */
-    {   129,     58,      6,     11,     38,      2,      0,      5,      2,     67, }, /* left_mode 4 */
-    {    53,     22,     11,     16,      8,     26,     14,      3,     19,     12, }, /* left_mode 5 */
-    {    59,     26,     61,     11,      4,      9,     35,     13,      8,      8, }, /* left_mode 6 */
-    {   101,     52,     40,      8,      5,      2,      8,     59,      2,     20, }, /* left_mode 7 */
-    {    48,     34,     10,     52,      8,     15,      6,      6,     63,     20, }, /* left_mode 8 */
-    {    96,     48,     22,     63,     11,     14,      5,      8,      9,     96, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  2*/
-    {   709,    461,    506,     36,     27,     33,    151,     98,     24,      6, }, /* left_mode 0 */
-    {   201,    375,    442,     27,     13,      8,     46,     58,      6,     19, }, /* left_mode 1 */
-    {   122,    140,    417,      4,     13,      3,     33,     59,      4,      2, }, /* left_mode 2 */
-    {    36,     17,     22,     16,      6,      8,     12,     17,      9,     21, }, /* left_mode 3 */
-    {    51,     15,      7,      1,     14,      0,      4,      5,      3,     22, }, /* left_mode 4 */
-    {    18,     11,     30,      9,      7,     20,     11,      5,      2,      6, }, /* left_mode 5 */
-    {    38,     21,    103,      9,      4,     12,     79,     13,      2,      5, }, /* left_mode 6 */
-    {    64,     17,     66,      2,     12,      4,      2,     65,      4,      5, }, /* left_mode 7 */
-    {    14,      7,      7,     16,      3,     11,      4,     13,     15,     16, }, /* left_mode 8 */
-    {    36,      8,     32,      9,      9,      4,     14,      7,      6,     24, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  3*/
-    {  1340,    173,     36,    119,     30,     10,     13,     10,     20,     26, }, /* left_mode 0 */
-    {   156,    293,     26,    108,      5,     16,      2,      4,     23,     30, }, /* left_mode 1 */
-    {    60,     34,     13,      7,      3,      3,      0,      8,      4,      5, }, /* left_mode 2 */
-    {    72,     64,      1,    235,      3,      9,      2,      7,     28,     38, }, /* left_mode 3 */
-    {    29,     14,      1,      3,      5,      0,      2,      2,      5,     13, }, /* left_mode 4 */
-    {    22,      7,      4,     11,      2,      5,      1,      2,      6,      4, }, /* left_mode 5 */
-    {    18,     14,      5,      6,      4,      3,     14,      0,      9,      2, }, /* left_mode 6 */
-    {    41,     10,      7,      1,      2,      0,      0,     10,      2,      1, }, /* left_mode 7 */
-    {    23,     19,      2,     33,      1,      5,      2,      0,     51,      8, }, /* left_mode 8 */
-    {    33,     26,      7,     53,      3,      9,      3,      3,      9,     19, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  4*/
-    {   410,    165,     43,     31,     66,     15,     30,     54,      8,     17, }, /* left_mode 0 */
-    {   115,     64,     27,     18,     30,      7,     11,     15,      4,     19, }, /* left_mode 1 */
-    {    31,     23,     25,      1,      7,      2,      2,     10,      0,      5, }, /* left_mode 2 */
-    {    17,      4,      1,      6,      8,      2,      7,      5,      5,     21, }, /* left_mode 3 */
-    {   120,     12,      1,      2,     83,      3,      0,      4,      1,     40, }, /* left_mode 4 */
-    {     4,      3,      1,      2,      1,      2,      5,      0,      3,      6, }, /* left_mode 5 */
-    {    10,      2,     13,      6,      6,      6,      8,      2,      4,      5, }, /* left_mode 6 */
-    {    58,     10,      5,      1,     28,      1,      1,     33,      1,      9, }, /* left_mode 7 */
-    {     8,      2,      1,      4,      2,      5,      1,      1,      2,     10, }, /* left_mode 8 */
-    {    76,      7,      5,      7,     18,      2,      2,      0,      5,     45, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  5*/
-    {   444,     46,     47,     20,     14,    110,     60,     14,     60,      7, }, /* left_mode 0 */
-    {    59,     57,     25,     18,      3,     17,     21,      6,     14,      6, }, /* left_mode 1 */
-    {    24,     17,     20,      6,      4,     13,      7,      2,      3,      2, }, /* left_mode 2 */
-    {    13,     11,      5,     14,      4,      9,      2,      4,     15,      7, }, /* left_mode 3 */
-    {     8,      5,      2,      1,      4,      0,      1,      1,      2,     12, }, /* left_mode 4 */
-    {    19,      5,      5,      7,      4,     40,      6,      3,     10,      4, }, /* left_mode 5 */
-    {    16,      5,      9,      1,      1,     16,     26,      2,     10,      4, }, /* left_mode 6 */
-    {    11,      4,      8,      1,      1,      4,      4,      5,      4,      1, }, /* left_mode 7 */
-    {    15,      1,      3,      7,      3,     21,      7,      1,     34,      5, }, /* left_mode 8 */
-    {    18,      5,      1,      3,      4,      3,      7,      1,      2,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  6*/
-    {   476,    149,     94,     13,     14,     77,    291,     27,     23,      3, }, /* left_mode 0 */
-    {    79,     83,     42,     14,      2,     12,     63,      2,      4,     14, }, /* left_mode 1 */
-    {    43,     36,     55,      1,      3,      8,     42,     11,      5,      1, }, /* left_mode 2 */
-    {     9,      9,      6,     16,      1,      5,      6,      3,     11,     10, }, /* left_mode 3 */
-    {    10,      3,      1,      3,     10,      1,      0,      1,      1,      4, }, /* left_mode 4 */
-    {    14,      6,     15,      5,      1,     20,     25,      2,      5,      0, }, /* left_mode 5 */
-    {    28,      7,     51,      1,      0,      8,    127,      6,      2,      5, }, /* left_mode 6 */
-    {    13,      3,      3,      2,      3,      1,      2,      8,      1,      2, }, /* left_mode 7 */
-    {    10,      3,      3,      3,      3,      8,      2,      2,      9,      3, }, /* left_mode 8 */
-    {    13,      7,     11,      4,      0,      4,      6,      2,      5,      8, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  7*/
-    {   376,    135,    119,      6,     32,      8,     31,    224,      9,      3, }, /* left_mode 0 */
-    {    93,     60,     54,      6,     13,      7,      8,     92,      2,     12, }, /* left_mode 1 */
-    {    74,     36,     84,      0,      3,      2,      9,     67,      2,      1, }, /* left_mode 2 */
-    {    19,      4,      4,      8,      8,      2,      4,      7,      6,     16, }, /* left_mode 3 */
-    {    51,      7,      4,      1,     77,      3,      0,     14,      1,     15, }, /* left_mode 4 */
-    {     7,      7,      5,      7,      4,      7,      4,      5,      0,      3, }, /* left_mode 5 */
-    {    18,      2,     19,      2,      2,      4,     12,     11,      1,      2, }, /* left_mode 6 */
-    {   129,      6,     27,      1,     21,      3,      0,    189,      0,      6, }, /* left_mode 7 */
-    {     9,      1,      2,      8,      3,      7,      0,      5,      3,      3, }, /* left_mode 8 */
-    {    20,      4,      5,     10,      4,      2,      7,     17,      3,     16, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  8*/
-    {   617,     68,     34,     79,     11,     27,     25,     14,     75,     13, }, /* left_mode 0 */
-    {    51,     82,     21,     26,      6,     12,     13,      1,     26,     16, }, /* left_mode 1 */
-    {    29,      9,     12,     11,      3,      7,      1,     10,      2,      2, }, /* left_mode 2 */
-    {    17,     19,     11,     74,      4,      3,      2,      0,     58,     13, }, /* left_mode 3 */
-    {    10,      1,      1,      3,      4,      1,      0,      2,      1,      8, }, /* left_mode 4 */
-    {    14,      4,      5,      5,      1,     13,      2,      0,     27,      8, }, /* left_mode 5 */
-    {    10,      3,      5,      4,      1,      7,      6,      4,      5,      1, }, /* left_mode 6 */
-    {    10,      2,      6,      2,      1,      1,      1,      4,      2,      1, }, /* left_mode 7 */
-    {    14,      8,      5,     23,      2,     12,      6,      2,    117,      5, }, /* left_mode 8 */
-    {     9,      6,      2,     19,      1,      6,      3,      2,      9,      9, }, /* left_mode 9 */
-  },
-  {
-    /*Above Mode :  9*/
-    {   680,     73,     22,     38,     42,      5,     11,      9,      6,     28, }, /* left_mode 0 */
-    {   113,    112,     21,     22,     10,      2,      8,      4,      6,     42, }, /* left_mode 1 */
-    {    44,     20,     24,      6,      5,      4,      3,      3,      1,      2, }, /* left_mode 2 */
-    {    40,     23,      7,     71,      5,      2,      4,      1,      7,     22, }, /* left_mode 3 */
-    {    85,      9,      4,      4,     17,      2,      0,      3,      2,     23, }, /* left_mode 4 */
-    {    13,      4,      2,      6,      1,      7,      0,      1,      7,      6, }, /* left_mode 5 */
-    {    26,      6,      8,      3,      2,      3,      8,      1,      5,      4, }, /* left_mode 6 */
-    {    54,      8,      9,      6,      7,      0,      1,     11,      1,      3, }, /* left_mode 7 */
-    {     9,     10,      4,     13,      2,      5,      4,      2,     14,      8, }, /* left_mode 8 */
-    {    92,      9,      5,     19,     15,      3,      3,      1,      6,     58, }, /* left_mode 9 */
-  },
+const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES]
+                                         [VP9_INTRA_MODES - 1] = {
+  { /* above = dc */
+    { 137,  30,  42, 148, 151, 207,  70,  52,  91 } /* left = dc */,
+    {  92,  45, 102, 136, 116, 180,  74,  90, 100 } /* left = v */,
+    {  73,  32,  19, 187, 222, 215,  46,  34, 100 } /* left = h */,
+    {  91,  30,  32, 116, 121, 186,  93,  86,  94 } /* left = d45 */,
+    {  72,  35,  36, 149,  68, 206,  68,  63, 105 } /* left = d135 */,
+    {  73,  31,  28, 138,  57, 124,  55, 122, 151 } /* left = d117 */,
+    {  67,  23,  21, 140, 126, 197,  40,  37, 171 } /* left = d153 */,
+    {  86,  27,  28, 128, 154, 212,  45,  43,  53 } /* left = d27 */,
+    {  74,  32,  27, 107,  86, 160,  63, 134, 102 } /* left = d63 */,
+    {  59,  67,  44, 140, 161, 202,  78,  67, 119 } /* left = tm */
+  }, { /* above = v */
+    {  63,  36, 126, 146, 123, 158,  60,  90,  96 } /* left = dc */,
+    {  43,  46, 168, 134, 107, 128,  69, 142,  92 } /* left = v */,
+    {  44,  29,  68, 159, 201, 177,  50,  57,  77 } /* left = h */,
+    {  58,  38,  76, 114,  97, 172,  78, 133,  92 } /* left = d45 */,
+    {  46,  41,  76, 140,  63, 184,  69, 112,  57 } /* left = d135 */,
+    {  38,  32,  85, 140,  46, 112,  54, 151, 133 } /* left = d117 */,
+    {  39,  27,  61, 131, 110, 175,  44,  75, 136 } /* left = d153 */,
+    {  52,  30,  74, 113, 130, 175,  51,  64,  58 } /* left = d27 */,
+    {  47,  35,  80, 100,  74, 143,  64, 163,  74 } /* left = d63 */,
+    {  36,  61, 116, 114, 128, 162,  80, 125,  82 } /* left = tm */
+  }, { /* above = h */
+    {  82,  26,  26, 171, 208, 204,  44,  32, 105 } /* left = dc */,
+    {  55,  44,  68, 166, 179, 192,  57,  57, 108 } /* left = v */,
+    {  42,  26,  11, 199, 241, 228,  23,  15,  85 } /* left = h */,
+    {  68,  42,  19, 131, 160, 199,  55,  52,  83 } /* left = d45 */,
+    {  58,  50,  25, 139, 115, 232,  39,  52, 118 } /* left = d135 */,
+    {  50,  35,  33, 153, 104, 162,  64,  59, 131 } /* left = d117 */,
+    {  44,  24,  16, 150, 177, 202,  33,  19, 156 } /* left = d153 */,
+    {  55,  27,  12, 153, 203, 218,  26,  27,  49 } /* left = d27 */,
+    {  53,  49,  21, 110, 116, 168,  59,  80,  76 } /* left = d63 */,
+    {  38,  72,  19, 168, 203, 212,  50,  50, 107 } /* left = tm */
+  }, { /* above = d45 */
+    { 103,  26,  36, 129, 132, 201,  83,  80,  93 } /* left = dc */,
+    {  59,  38,  83, 112, 103, 162,  98, 136,  90 } /* left = v */,
+    {  62,  30,  23, 158, 200, 207,  59,  57,  50 } /* left = h */,
+    {  67,  30,  29,  84,  86, 191, 102,  91,  59 } /* left = d45 */,
+    {  60,  32,  33, 112,  71, 220,  64,  89, 104 } /* left = d135 */,
+    {  53,  26,  34, 130,  56, 149,  84, 120, 103 } /* left = d117 */,
+    {  53,  21,  23, 133, 109, 210,  56,  77, 172 } /* left = d153 */,
+    {  77,  19,  29, 112, 142, 228,  55,  66,  36 } /* left = d27 */,
+    {  61,  29,  29,  93,  97, 165,  83, 175, 162 } /* left = d63 */,
+    {  47,  47,  43, 114, 137, 181, 100,  99,  95 } /* left = tm */
+  }, { /* above = d135 */
+    {  69,  23,  29, 128,  83, 199,  46,  44, 101 } /* left = dc */,
+    {  53,  40,  55, 139,  69, 183,  61,  80, 110 } /* left = v */,
+    {  40,  29,  19, 161, 180, 207,  43,  24,  91 } /* left = h */,
+    {  60,  34,  19, 105,  61, 198,  53,  64,  89 } /* left = d45 */,
+    {  52,  31,  22, 158,  40, 209,  58,  62,  89 } /* left = d135 */,
+    {  44,  31,  29, 147,  46, 158,  56, 102, 198 } /* left = d117 */,
+    {  35,  19,  12, 135,  87, 209,  41,  45, 167 } /* left = d153 */,
+    {  55,  25,  21, 118,  95, 215,  38,  39,  66 } /* left = d27 */,
+    {  51,  38,  25, 113,  58, 164,  70,  93,  97 } /* left = d63 */,
+    {  47,  54,  34, 146, 108, 203,  72, 103, 151 } /* left = tm */
+  }, { /* above = d117 */
+    {  64,  19,  37, 156,  66, 138,  49,  95, 133 } /* left = dc */,
+    {  46,  27,  80, 150,  55, 124,  55, 121, 135 } /* left = v */,
+    {  36,  23,  27, 165, 149, 166,  54,  64, 118 } /* left = h */,
+    {  53,  21,  36, 131,  63, 163,  60, 109,  81 } /* left = d45 */,
+    {  40,  26,  35, 154,  40, 185,  51,  97, 123 } /* left = d135 */,
+    {  35,  19,  34, 179,  19,  97,  48, 129, 124 } /* left = d117 */,
+    {  36,  20,  26, 136,  62, 164,  33,  77, 154 } /* left = d153 */,
+    {  45,  18,  32, 130,  90, 157,  40,  79,  91 } /* left = d27 */,
+    {  45,  26,  28, 129,  45, 129,  49, 147, 123 } /* left = d63 */,
+    {  38,  44,  51, 136,  74, 162,  57,  97, 121 } /* left = tm */
+  }, { /* above = d153 */
+    {  75,  17,  22, 136, 138, 185,  32,  34, 166 } /* left = dc */,
+    {  56,  39,  58, 133, 117, 173,  48,  53, 187 } /* left = v */,
+    {  35,  21,  12, 161, 212, 207,  20,  23, 145 } /* left = h */,
+    {  56,  29,  19, 117, 109, 181,  55,  68, 112 } /* left = d45 */,
+    {  47,  29,  17, 153,  64, 220,  59,  51, 114 } /* left = d135 */,
+    {  46,  16,  24, 136,  76, 147,  41,  64, 172 } /* left = d117 */,
+    {  34,  17,  11, 108, 152, 187,  13,  15, 209 } /* left = d153 */,
+    {  51,  24,  14, 115, 133, 209,  32,  26, 104 } /* left = d27 */,
+    {  55,  30,  18, 122,  79, 179,  44,  88, 116 } /* left = d63 */,
+    {  37,  49,  25, 129, 168, 164,  41,  54, 148 } /* left = tm */
+  }, { /* above = d27 */
+    {  82,  22,  32, 127, 143, 213,  39,  41,  70 } /* left = dc */,
+    {  62,  44,  61, 123, 105, 189,  48,  57,  64 } /* left = v */,
+    {  47,  25,  17, 175, 222, 220,  24,  30,  86 } /* left = h */,
+    {  68,  36,  17, 106, 102, 206,  59,  74,  74 } /* left = d45 */,
+    {  57,  39,  23, 151,  68, 216,  55,  63,  58 } /* left = d135 */,
+    {  49,  30,  35, 141,  70, 168,  82,  40, 115 } /* left = d117 */,
+    {  51,  25,  15, 136, 129, 202,  38,  35, 139 } /* left = d153 */,
+    {  68,  26,  16, 111, 141, 215,  29,  28,  28 } /* left = d27 */,
+    {  59,  39,  19, 114,  75, 180,  77, 104,  42 } /* left = d63 */,
+    {  40,  61,  26, 126, 152, 206,  61,  59,  93 } /* left = tm */
+  }, { /* above = d63 */
+    {  78,  23,  39, 111, 117, 170,  74, 124,  94 } /* left = dc */,
+    {  48,  34,  86, 101,  92, 146,  78, 179, 134 } /* left = v */,
+    {  47,  22,  24, 138, 187, 178,  68,  69,  59 } /* left = h */,
+    {  56,  25,  33, 105, 112, 187,  95, 177, 129 } /* left = d45 */,
+    {  48,  31,  27, 114,  63, 183,  82, 116,  56 } /* left = d135 */,
+    {  43,  28,  37, 121,  63, 123,  61, 192, 169 } /* left = d117 */,
+    {  42,  17,  24, 109,  97, 177,  56,  76, 122 } /* left = d153 */,
+    {  58,  18,  28, 105, 139, 182,  70,  92,  63 } /* left = d27 */,
+    {  46,  23,  32,  74,  86, 150,  67, 183,  88 } /* left = d63 */,
+    {  36,  38,  48,  92, 122, 165,  88, 137,  91 } /* left = tm */
+  }, { /* above = tm */
+    {  65,  70,  60, 155, 159, 199,  61,  60,  81 } /* left = dc */,
+    {  44,  78, 115, 132, 119, 173,  71, 112,  93 } /* left = v */,
+    {  39,  38,  21, 184, 227, 206,  42,  32,  64 } /* left = h */,
+    {  58,  47,  36, 124, 137, 193,  80,  82,  78 } /* left = d45 */,
+    {  49,  50,  35, 144,  95, 205,  63,  78,  59 } /* left = d135 */,
+    {  41,  53,  52, 148,  71, 142,  65, 128,  51 } /* left = d117 */,
+    {  40,  36,  28, 143, 143, 202,  40,  55, 137 } /* left = d153 */,
+    {  52,  34,  29, 129, 183, 227,  42,  35,  43 } /* left = d27 */,
+    {  42,  44,  44, 104, 105, 164,  64, 130,  80 } /* left = d63 */,
+    {  43,  81,  53, 140, 169, 204,  68,  84,  72 } /* left = tm */
+  }
 };
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -11,35 +11,34 @@
 #include "vp9/common/vp9_mvref_common.h"
 
 #define MVREF_NEIGHBOURS 8
-
-static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-    {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
-    {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
+static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
+  // SB4X4
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB4X8
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB8X4
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB8X8
+  {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
+  // SB8X16
+  {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}},
+  // SB16X8
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}},
+  // SB16X16
+  {{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // SB16X32
+  {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}},
+  // SB32X16
+  {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // SB32X32
+  {{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}},
+  // SB32X64
+  {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
+  // SB64X32
+  {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}},
+  // SB64X64
+  {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}}
 };
-
-static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3, 3, 2, 1, 1, 1, 1, 1 };
-
-static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-    {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-    {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
-};
-
-static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3, 3, 2, 2, 2, 1, 1, 1 };
-
-
-
-static int sb64_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-    {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-    {2, -1}, {-1, 2}, {3, -1}, {-1,-1}
-};
-
-static int sb64_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 1, 1, 1, 1, 1, 1, 1, 1 };
-
-
-
 // clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
 
@@ -50,15 +49,21 @@
                                        xd->mb_to_bottom_edge + MV_BORDER);
 }
 
-// Gets a candidate refenence motion vector from the given mode info
+// Gets a candidate reference motion vector from the given mode info
 // structure if one exists that matches the given reference frame.
 static int get_matching_candidate(const MODE_INFO *candidate_mi,
                                   MV_REFERENCE_FRAME ref_frame,
-                                  int_mv *c_mv) {
-  if (ref_frame == candidate_mi->mbmi.ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+                                  int_mv *c_mv, int block_idx) {
+  if (ref_frame == candidate_mi->mbmi.ref_frame[0]) {
+    if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+      c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int;
+    else
+      c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+  } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) {
+    if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+      c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int;
+    else
+      c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
   } else {
     return 0;
   }
@@ -66,7 +71,7 @@
   return 1;
 }
 
-// Gets candidate refenence motion vector(s) from the given mode info
+// Gets candidate reference motion vector(s) from the given mode info
 // structure if they exists and do NOT match the given reference frame.
 static void get_non_matching_candidates(const MODE_INFO *candidate_mi,
                                         MV_REFERENCE_FRAME ref_frame,
@@ -81,18 +86,18 @@
   *c2_ref_frame = INTRA_FRAME;
 
   // If first candidate not valid neither will be.
-  if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
+  if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) {
     // First candidate
-    if (candidate_mi->mbmi.ref_frame != ref_frame) {
-      *c_ref_frame = candidate_mi->mbmi.ref_frame;
+    if (candidate_mi->mbmi.ref_frame[0] != ref_frame) {
+      *c_ref_frame = candidate_mi->mbmi.ref_frame[0];
       c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
     }
 
     // Second candidate
-    if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.second_ref_frame != ref_frame) &&
+    if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) &&
+        (candidate_mi->mbmi.ref_frame[1] != ref_frame) &&
         (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) {
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+      *c2_ref_frame = candidate_mi->mbmi.ref_frame[1];
       c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
     }
   }
@@ -103,10 +108,6 @@
 static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,
                      MV_REFERENCE_FRAME candidate_ref_frame,
                      int_mv *candidate_mv, int *ref_sign_bias) {
-  // int frame_distances[MAX_REF_FRAMES];
-  // int last_distance = 1;
-  // int gf_distance = xd->frames_since_golden;
-  // int arf_distance = xd->frames_till_alt_ref_frame;
 
   // Sign inversion where appropriate.
   if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
@@ -113,135 +114,35 @@
     candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
     candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
   }
-
-  /*
-  // Scale based on frame distance if the reference frames not the same.
-  frame_distances[INTRA_FRAME] = 1;   // should never be used
-  frame_distances[LAST_FRAME] = 1;
-  frame_distances[GOLDEN_FRAME] =
-    (xd->frames_since_golden) ? xd->frames_si nce_golden : 1;
-  frame_distances[ALTREF_FRAME] =
-    (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
-  if (frame_distances[this_ref_frame] &&
-      frame_distances[candidate_ref_frame]) {
-    candidate_mv->as_mv.row =
-      (short)(((int)(candidate_mv->as_mv.row) *
-               frame_distances[this_ref_frame]) /
-              frame_distances[candidate_ref_frame]);
-
-    candidate_mv->as_mv.col =
-      (short)(((int)(candidate_mv->as_mv.col) *
-               frame_distances[this_ref_frame]) /
-              frame_distances[candidate_ref_frame]);
-  }
-  */
 }
 
-/*
-// Adds a new candidate reference vector to the sorted list.
-// If it is a repeat the weight of the existing entry is increased
-// and the order of the list is resorted.
-// This method of add plus sort has been deprecated for now as there is a
-// further sort of the best candidates in vp9_find_best_ref_mvs() and the
-// incremental benefit of both is small. If the decision is made to remove
-// the sort in vp9_find_best_ref_mvs() for performance reasons then it may be
-// worth re-instating some sort of list reordering by weight here.
-//
-static void addmv_and_shuffle(
-  int_mv *mv_list,
-  int *mv_scores,
-  int *refmv_count,
-  int_mv candidate_mv,
-  int weight
-) {
-
-  int i;
-  int insert_point;
-  int duplicate_found = FALSE;
-
-  // Check for duplicates. If there is one increase its score.
-  // We only compare vs the current top candidates.
-  insert_point = (*refmv_count < (MAX_MV_REF_CANDIDATES - 1))
-                 ? *refmv_count : (MAX_MV_REF_CANDIDATES - 1);
-
-  i = insert_point;
-  if (*refmv_count > i)
-    i++;
-  while (i > 0) {
-    i--;
-    if (candidate_mv.as_int == mv_list[i].as_int) {
-      duplicate_found = TRUE;
-      mv_scores[i] += weight;
-      break;
-    }
-  }
-
-  // If no duplicate and the new candidate is good enough then add it.
-  if (!duplicate_found ) {
-    if (weight > mv_scores[insert_point]) {
-      mv_list[insert_point].as_int = candidate_mv.as_int;
-      mv_scores[insert_point] = weight;
-      i = insert_point;
-    }
-    (*refmv_count)++;
-  }
-
-  // Reshuffle the list so that highest scoring mvs at the top.
-  while (i > 0) {
-    if (mv_scores[i] > mv_scores[i-1]) {
-      int tmp_score = mv_scores[i-1];
-      int_mv tmp_mv = mv_list[i-1];
-
-      mv_scores[i-1] = mv_scores[i];
-      mv_list[i-1] = mv_list[i];
-      mv_scores[i] = tmp_score;
-      mv_list[i] = tmp_mv;
-      i--;
-    } else
-      break;
-  }
-}
-*/
-
-// Adds a new candidate reference vector to the list.
-// The mv is thrown out if it is already in the list.
-// Unlike the addmv_and_shuffle() this does not reorder the list
-// but assumes that candidates are added in the order most likely to
-// match distance and reference frame bias.
+// Add a candidate mv.
+// Discard if it has already been seen.
 static void add_candidate_mv(int_mv *mv_list,  int *mv_scores,
                              int *candidate_count, int_mv candidate_mv,
                              int weight) {
-  int i;
-
-  // Make sure we dont insert off the end of the list
-  const int insert_point = MIN(*candidate_count, MAX_MV_REF_CANDIDATES - 1);
-
-  // Look for duplicates
-  for (i = 0; i <= insert_point; ++i) {
-    if (candidate_mv.as_int == mv_list[i].as_int)
-      break;
+  if (*candidate_count == 0) {
+    mv_list[0].as_int = candidate_mv.as_int;
+    mv_scores[0] = weight;
+    *candidate_count += 1;
+  } else if ((*candidate_count == 1) &&
+             (candidate_mv.as_int != mv_list[0].as_int)) {
+    mv_list[1].as_int = candidate_mv.as_int;
+    mv_scores[1] = weight;
+    *candidate_count += 1;
   }
-
-  // Add the candidate. If the list is already full it is only desirable that
-  // it should overwrite if it has a higher weight than the last entry.
-  if (i >= insert_point && weight > mv_scores[insert_point]) {
-    mv_list[insert_point].as_int = candidate_mv.as_int;
-    mv_scores[insert_point] = weight;
-    *candidate_count += (*candidate_count < MAX_MV_REF_CANDIDATES);
-  }
 }
 
-// This function searches the neighbourhood of a given MB/SB and populates a
-// list of candidate reference vectors.
+// This function searches the neighbourhood of a given MB/SB
+// to try and find candidate reference vectors.
 //
-void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
-                      MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int *ref_sign_bias) {
+void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
+                          MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
+                          int_mv *mv_ref_list, int *ref_sign_bias,
+                          int block_idx) {
   int i;
   MODE_INFO *candidate_mi;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int_mv candidate_mvs[MAX_MV_REF_CANDIDATES];
   int_mv c_refmv;
   int_mv c2_refmv;
   MV_REFERENCE_FRAME c_ref_frame;
@@ -250,110 +151,119 @@
   int refmv_count = 0;
   int split_count = 0;
   int (*mv_ref_search)[2];
-  int *ref_distance_weight;
-  int zero_seen = FALSE;
-  const int mb_col = (-xd->mb_to_left_edge) >> 7;
+  const int mi_col = get_mi_col(xd);
+  const int mi_row = get_mi_row(xd);
+  int intra_count = 0;
+  int zero_count = 0;
+  int newmv_count = 0;
+  int x_idx = 0, y_idx = 0;
 
   // Blank the reference vector lists and other local structures.
   vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
-  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
   vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
 
-  if (mbmi->sb_type == BLOCK_SIZE_SB64X64) {
-    mv_ref_search = sb64_mv_ref_search;
-    ref_distance_weight = sb64_ref_distance_weight;
-  } else if (mbmi->sb_type == BLOCK_SIZE_SB32X32) {
-    mv_ref_search = sb_mv_ref_search;
-    ref_distance_weight = sb_ref_distance_weight;
-  } else {
-    mv_ref_search = mb_mv_ref_search;
-    ref_distance_weight = mb_ref_distance_weight;
+  mv_ref_search = mv_ref_blocks[mbmi->sb_type];
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    x_idx = block_idx & 1;
+    y_idx = block_idx >> 1;
   }
 
   // We first scan for candidate vectors that match the current reference frame
   // Look at nearest neigbours
   for (i = 0; i < 2; ++i) {
-    const int mb_search_col = mb_col + mv_ref_search[i][0];
+    const int mi_search_col = mi_col + mv_ref_search[i][0];
+    const int mi_search_row = mi_row + mv_ref_search[i][1];
+    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
+        (mi_search_col < cm->cur_tile_mi_col_end) &&
+        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
+      int b;
 
-    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
-        (mb_search_col < cm->cur_tile_mb_col_end) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-        add_candidate_mv(candidate_mvs, candidate_scores,
-                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);
+      if (block_idx >= 0) {
+        if (mv_ref_search[i][0])
+          b = 1 + y_idx * 2;
+        else
+          b = 2 + x_idx;
+      } else {
+        b = -1;
       }
-      split_count += (candidate_mi->mbmi.mode == SPLITMV);
+      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) {
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c_refmv, 16);
+      }
+      split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 &&
+                      candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME);
+
+      // Count number of neighbours coded intra and zeromv
+      intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
+      zero_count += (candidate_mi->mbmi.mode == ZEROMV);
+      newmv_count += (candidate_mi->mbmi.mode >= NEWMV);
     }
   }
-  // Look in the last frame if it exists
-  if (lf_here) {
-    candidate_mi = lf_here;
-    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-      add_candidate_mv(candidate_mvs, candidate_scores,
-                       &refmv_count, c_refmv, 18);
-    }
-  }
+
   // More distant neigbours
   for (i = 2; (i < MVREF_NEIGHBOURS) &&
-              (refmv_count < (MAX_MV_REF_CANDIDATES - 1)); ++i) {
-    const int mb_search_col = mb_col + mv_ref_search[i][0];
-
-    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
-        (mb_search_col < cm->cur_tile_mb_col_end) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
+    const int mi_search_col = mi_col + mv_ref_search[i][0];
+    const int mi_search_row = mi_row + mv_ref_search[i][1];
+    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
+        (mi_search_col < cm->cur_tile_mi_col_end) &&
+        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
-        add_candidate_mv(candidate_mvs, candidate_scores,
-                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);
+      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c_refmv, 16);
       }
     }
   }
 
+  // Look in the last frame if it exists
+  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
+    candidate_mi = lf_here;
+    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
+      add_candidate_mv(mv_ref_list, candidate_scores,
+                       &refmv_count, c_refmv, 16);
+    }
+  }
+
   // If we have not found enough candidates consider ones where the
   // reference frame does not match. Break out when we have
   // MAX_MV_REF_CANDIDATES candidates.
   // Look first at spatial neighbours
-  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1)) {
-    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
-      const int mb_search_col = mb_col + mv_ref_search[i][0];
+  for (i = 0; (i < MVREF_NEIGHBOURS) &&
+              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
+    const int mi_search_col = mi_col + mv_ref_search[i][0];
+    const int mi_search_row = mi_row + mv_ref_search[i][1];
+    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
+        (mi_search_col < cm->cur_tile_mi_col_end) &&
+        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      if ((mb_search_col >= cm->cur_tile_mb_col_start) &&
-          (mb_search_col < cm->cur_tile_mb_col_end) &&
-          ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+      get_non_matching_candidates(candidate_mi, ref_frame,
+                                  &c_ref_frame, &c_refmv,
+                                  &c2_ref_frame, &c2_refmv);
 
-        candidate_mi = here + mv_ref_search[i][0] +
-                       (mv_ref_search[i][1] * xd->mode_info_stride);
-
-        get_non_matching_candidates(candidate_mi, ref_frame,
-                                    &c_ref_frame, &c_refmv,
-                                    &c2_ref_frame, &c2_refmv);
-
-        if (c_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-          add_candidate_mv(candidate_mvs, candidate_scores,
-                           &refmv_count, c_refmv, ref_distance_weight[i]);
-        }
-
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-          add_candidate_mv(candidate_mvs, candidate_scores,
-                           &refmv_count, c2_refmv, ref_distance_weight[i]);
-        }
+      if (c_ref_frame != INTRA_FRAME) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c_refmv, 1);
       }
 
-      if (refmv_count >= (MAX_MV_REF_CANDIDATES - 1)) {
-        break;
+      if (c2_ref_frame != INTRA_FRAME) {
+        scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+        add_candidate_mv(mv_ref_list, candidate_scores,
+                         &refmv_count, c2_refmv, 1);
       }
     }
   }
+
   // Look at the last frame if it exists
-  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1) && lf_here) {
+  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
     candidate_mi = lf_here;
     get_non_matching_candidates(candidate_mi, ref_frame,
                                 &c_ref_frame, &c_refmv,
@@ -361,49 +271,36 @@
 
     if (c_ref_frame != INTRA_FRAME) {
       scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-      add_candidate_mv(candidate_mvs, candidate_scores,
-                       &refmv_count, c_refmv, 2);
+      add_candidate_mv(mv_ref_list, candidate_scores,
+                       &refmv_count, c_refmv, 1);
     }
 
     if (c2_ref_frame != INTRA_FRAME) {
       scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-      add_candidate_mv(candidate_mvs, candidate_scores,
-                       &refmv_count, c2_refmv, 2);
+      add_candidate_mv(mv_ref_list, candidate_scores,
+                       &refmv_count, c2_refmv, 1);
     }
   }
 
-  // Define inter mode coding context.
-  // 0,0 was best
-  if (candidate_mvs[0].as_int == 0) {
-    // 0,0 is only candidate
-    if (refmv_count <= 1) {
-      mbmi->mb_mode_context[ref_frame] = 0;
-    // non zero candidates candidates available
-    } else if (split_count == 0) {
-      mbmi->mb_mode_context[ref_frame] = 1;
+  if (!intra_count) {
+    if (!newmv_count) {
+      // 0 = both zero mv
+      // 1 = one zero mv + one predicted mv
+      // 2 = two predicted mvs
+      mbmi->mb_mode_context[ref_frame] = 2 - zero_count;
     } else {
-      mbmi->mb_mode_context[ref_frame] = 2;
+      // 3 = one predicted/zero and one new mv
+      // 4 = two new mvs
+      mbmi->mb_mode_context[ref_frame] = 2 + newmv_count;
     }
-  } else if (split_count == 0) {
-    // Non zero best, No Split MV cases
-    mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 3 : 4;
   } else {
-    // Non zero best, some split mv
-    mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 5 : 6;
+    // 5 = one intra neighbour + x
+    // 6 = two intra neighbours
+    mbmi->mb_mode_context[ref_frame] = 4 + intra_count;
   }
 
-  // Scan for 0,0 case and clamp non zero choices
+  // Clamp vectors
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    if (candidate_mvs[i].as_int == 0) {
-      zero_seen = TRUE;
-    } else {
-      clamp_mv_ref(xd, &candidate_mvs[i]);
-    }
+    clamp_mv_ref(xd, &mv_ref_list[i]);
   }
-  // 0,0 is always a valid reference. Add it if not already seen.
-  if (!zero_seen)
-    candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0;
-
-  // Copy over the candidate list.
-  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
 }
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -14,12 +14,24 @@
 #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
 #define VP9_COMMON_VP9_MVREF_COMMON_H_
 
-void vp9_find_mv_refs(VP9_COMMON *cm,
-                      MACROBLOCKD *xd,
-                      MODE_INFO *here,
-                      MODE_INFO *lf_here,
-                      MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list,
-                      int *ref_sign_bias);
+void vp9_find_mv_refs_idx(VP9_COMMON *cm,
+                          MACROBLOCKD *xd,
+                          MODE_INFO *here,
+                          MODE_INFO *lf_here,
+                          MV_REFERENCE_FRAME ref_frame,
+                          int_mv *mv_ref_list,
+                          int *ref_sign_bias,
+                          int block_idx);
+
+static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
+                                    MACROBLOCKD *xd,
+                                    MODE_INFO *here,
+                                    MODE_INFO *lf_here,
+                                    MV_REFERENCE_FRAME ref_frame,
+                                    int_mv *mv_ref_list,
+                                    int *ref_sign_bias) {
+  vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame,
+                       mv_ref_list, ref_sign_bias, -1);
+}
 
 #endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -21,6 +21,9 @@
 #include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
+
+#define MAX_MB_SEGMENTS 8
+
   typedef int *VP9_PTR;
 
   /* Create/destroy static data structures. */
@@ -225,8 +228,9 @@
 
   int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
                      unsigned int rows, unsigned int cols,
-                     int delta_q[4], int delta_lf[4],
-                     unsigned int threshold[4]);
+                     int delta_q[MAX_MB_SEGMENTS],
+                     int delta_lf[MAX_MB_SEGMENTS],
+                     unsigned int threshold[MAX_MB_SEGMENTS]);
 
   int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
                          unsigned int rows, unsigned int cols);
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -18,28 +18,20 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_quant_common.h"
+
 #if CONFIG_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
 
-/*#ifdef PACKET_TESTING*/
-#include "vp9/common/vp9_header.h"
-/*#endif*/
-
 /* Create/destroy static data structures. */
 
-void vp9_initialize_common(void);
+// Define the number of candidate reference buffers.
+#define NUM_REF_FRAMES 8
+#define NUM_REF_FRAMES_LG2 3
 
-#define MINQ 0
+#define ALLOWED_REFS_PER_FRAME 3
 
-#define MAXQ 255
-#define QINDEX_BITS 8
-
-#define QINDEX_RANGE (MAXQ + 1)
-
-#define NUM_REF_FRAMES 3
-#define NUM_REF_FRAMES_LG2 2
-
 // 1 scratch frame for the new frame, 3 for scaled references on the encoder
 // TODO(jkoleszar): These 3 extra references could probably come from the
 // normal reference pool.
@@ -48,107 +40,72 @@
 #define NUM_FRAME_CONTEXTS_LG2 2
 #define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2)
 
-#define COMP_PRED_CONTEXTS   2
+#define MAX_LAG_BUFFERS 25
 
 typedef struct frame_contexts {
-  vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
+  vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+  vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
+                         [PARTITION_TYPES - 1];
 
-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                        [NZC4X4_NODES];
-  vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                        [NZC8X8_NODES];
-  vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC16X16_NODES];
-  vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC32X32_NODES];
-  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
-#endif
-
   nmv_context nmvc;
   nmv_context pre_nmvc;
-  vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
-  vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
-  unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
-  unsigned int sb_ymode_counts[VP9_I32X32_MODES];
-  unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
-  unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
-  unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
-  unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
+  /* interframe intra mode probs */
+  vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1];
+  vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+  vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  /* interframe intra mode probs */
+  unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
+  unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES];
+  unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
-  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                            [NZC4X4_NODES];
-  vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                            [NZC8X8_NODES];
-  vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                              [NZC16X16_NODES];
-  vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                              [NZC32X32_NODES];
-  vp9_prob pre_nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                             [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
-#endif
-
-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES];
   unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
                                 [COEF_BANDS][PREV_COEF_CONTEXTS];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC4X4_TOKENS];
-  unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC8X8_TOKENS];
-  unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                               [NZC16X16_TOKENS];
-  unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                               [NZC32X32_TOKENS];
-  unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS]
-                              [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA][2];
-#endif
-
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  unsigned int interintra_counts[2];
-  vp9_prob interintra_prob;
-  vp9_prob pre_interintra_prob;
-#endif
+  vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
+      [VP9_SWITCHABLE_FILTERS - 1];
+  unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
+                                      [VP9_SWITCHABLE_FILTERS];
 
-  int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
-  unsigned int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
+  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
+  vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
+  unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+
+  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob comp_ref_prob[REF_CONTEXTS];
+  vp9_prob pre_intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob pre_comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob pre_single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob pre_comp_ref_prob[REF_CONTEXTS];
+  unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
+  unsigned int single_ref_count[REF_CONTEXTS][2][2];
+  unsigned int comp_ref_count[REF_CONTEXTS][2];
+
+  vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  vp9_prob pre_tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  vp9_prob pre_tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  vp9_prob pre_tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  unsigned int tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
+  unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+
+  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
+  vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS];
+  unsigned int mbskip_count[MBSKIP_CONTEXTS][2];
 } FRAME_CONTEXT;
 
 typedef enum {
-  RECON_CLAMP_REQUIRED        = 0,
-  RECON_CLAMP_NOTREQUIRED     = 1
-} CLAMP_TYPE;
-
-typedef enum {
   SINGLE_PREDICTION_ONLY = 0,
   COMP_PREDICTION_ONLY   = 1,
   HYBRID_PREDICTION      = 2,
@@ -167,8 +124,11 @@
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
-  DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]);
+#if CONFIG_ALPHA
+  DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]);
+#endif
 
   int width;
   int height;
@@ -177,8 +137,13 @@
   int last_width;
   int last_height;
 
+  // TODO(jkoleszar): this implies chroma ss right now, but could vary per
+  // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to
+  // support additional planes.
+  int subsampling_x;
+  int subsampling_y;
+
   YUV_TYPE clr_type;
-  CLAMP_TYPE  clamp_type;
 
   YV12_BUFFER_CONFIG *frame_to_show;
 
@@ -186,13 +151,15 @@
   int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */
   int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */
 
-  /* TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
-   * roll new_fb_idx into it.
-   */
-  int active_ref_idx[3]; /* each frame can reference 3 buffers */
+  // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+  // roll new_fb_idx into it.
+
+  // Each frame can reference ALLOWED_REFS_PER_FRAME buffers
+  int active_ref_idx[ALLOWED_REFS_PER_FRAME];
+  struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
   int new_fb_idx;
-  struct scale_factors active_ref_scale[3];
 
+
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG temp_scale_frame;
 
@@ -201,28 +168,37 @@
   FRAME_TYPE frame_type;
 
   int show_frame;
+  int last_show_frame;
 
+  // Flag signaling that the frame is encoded using only INTRA modes.
+  int intra_only;
+
+  // Flag signaling that the frame context should be reset to default values.
+  // 0 or 1 implies don't reset, 2 reset just the context specified in the
+  // frame header, 3 reset all contexts.
+  int reset_frame_context;
+
   int frame_flags;
+  // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+  // MODE_INFO (8-pixel) units.
   int MBs;
-  int mb_rows;
-  int mb_cols;
+  int mb_rows, mi_rows;
+  int mb_cols, mi_cols;
   int mode_info_stride;
 
   /* profile settings */
-  int experimental;
-  int mb_no_coeff_skip;
   TXFM_MODE txfm_mode;
-  COMPPREDMODE_TYPE comp_pred_mode;
-  int no_lpf;
-  int use_bilinear_mc_filter;
-  int full_pixel;
 
   int base_qindex;
   int last_kf_gf_q;  /* Q used on the last GF or KF */
 
-  int y1dc_delta_q;
-  int uvdc_delta_q;
-  int uvac_delta_q;
+  int y_dc_delta_q;
+  int uv_dc_delta_q;
+  int uv_ac_delta_q;
+#if CONFIG_ALPHA
+  int a_dc_delta_q;
+  int a_ac_delta_q;
+#endif
 
   unsigned int frames_since_golden;
   unsigned int frames_till_alt_ref_frame;
@@ -240,7 +216,6 @@
   unsigned char *last_frame_seg_map;
 
   INTERPOLATIONFILTERTYPE mcomp_filter_type;
-  LOOPFILTERTYPE filter_type;
 
   loop_filter_info_n lf_info;
 
@@ -247,49 +222,36 @@
   int filter_level;
   int last_sharpness_level;
   int sharpness_level;
-  int dering_enabled;
 
-  int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */
+  int refresh_frame_context;    /* Two state 0 = NO, 1 = YES */
 
   int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */
 
   /* Y,U,V */
-  ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
 
+  // partition contexts
+  PARTITION_CONTEXT *above_seg_context;
+  PARTITION_CONTEXT left_seg_context[8];
+
   /* keyframe block modes are predicted by their above, left neighbors */
 
-  vp9_prob kf_bmode_prob[VP9_KF_BINTRAMODES]
-                        [VP9_KF_BINTRAMODES]
-                        [VP9_KF_BINTRAMODES - 1];
-  vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
-  vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
-  int kf_ymode_probs_index;
-  int kf_ymode_probs_update;
-  vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
+  vp9_prob kf_y_mode_prob[VP9_INTRA_MODES]
+                         [VP9_INTRA_MODES]
+                         [VP9_INTRA_MODES - 1];
+  vp9_prob kf_uv_mode_prob[VP9_INTRA_MODES] [VP9_INTRA_MODES - 1];
 
-  vp9_prob prob_intra_coded;
-  vp9_prob prob_last_coded;
-  vp9_prob prob_gf_coded;
-  vp9_prob sb32_coded;
-  vp9_prob sb64_coded;
-
   // Context probabilities when using predictive coding of segment id
   vp9_prob segment_pred_probs[PREDICTION_PROBS];
   unsigned char temporal_update;
 
   // Context probabilities for reference frame prediction
-  unsigned char ref_scores[MAX_REF_FRAMES];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS];
+  int allow_comp_inter_inter;
+  MV_REFERENCE_FRAME comp_fixed_ref;
+  MV_REFERENCE_FRAME comp_var_ref[2];
+  COMPPREDMODE_TYPE comp_pred_mode;
 
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
-
-  // FIXME contextualize
-  vp9_prob prob_tx[TX_SIZE_MAX_SB - 1];
-
-  vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
-
   FRAME_CONTEXT fc;  /* this frame entropy */
   FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
   unsigned int  frame_context_idx; /* Context to use/update */
@@ -298,9 +260,6 @@
   int near_boffset[3];
   int version;
 
-#ifdef PACKET_TESTING
-  VP9_HEADER oh;
-#endif
   double bitrate;
   double framerate;
 
@@ -308,17 +267,13 @@
   struct postproc_state  postproc_state;
 #endif
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  int use_interintra;
-#endif
-
   int error_resilient_mode;
   int frame_parallel_decoding_mode;
 
   int tile_columns, log2_tile_columns;
-  int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_col_idx;
+  int cur_tile_mi_col_start, cur_tile_mi_col_end, cur_tile_col_idx;
   int tile_rows, log2_tile_rows;
-  int cur_tile_mb_row_start, cur_tile_mb_row_end, cur_tile_row_idx;
+  int cur_tile_mi_row_start, cur_tile_mi_row_end, cur_tile_row_idx;
 } VP9_COMMON;
 
 static int get_free_fb(VP9_COMMON *cm) {
@@ -341,31 +296,76 @@
   buf[new_idx]++;
 }
 
-// TODO(debargha): merge the two functions
-static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd,
-                       int mb_row, int block_size) {
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+static int mi_cols_aligned_to_sb(VP9_COMMON *cm) {
+  return 2 * ((cm->mb_cols + 3) & ~3);
+}
 
-  // Are edges available for intra prediction?
-  xd->up_available    = (mb_row != 0);
+static INLINE void set_partition_seg_context(VP9_COMMON *cm,
+                                             MACROBLOCKD *xd,
+                                             int mi_row, int mi_col) {
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
 }
 
-static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd,
-                       int mb_col, int block_size) {
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                int mi_row, int mi_col,
+                                BLOCK_SIZE_TYPE bsize) {
+  int bsl = mi_width_log2(bsize), bs = 1 << bsl;
+  int ms = bs / 2;
 
+  if ((mi_row + ms < cm->mi_rows) && (mi_col + ms < cm->mi_cols))
+    return 0;
+
+  // frame width/height are multiples of 8, hence 8x8 block should always
+  // pass the above check
+  assert(bsize > BLOCK_SIZE_SB8X8);
+
+  // return the node index in the prob tree for binary coding
+  // skip horizontal/none partition types
+  if ((mi_col + ms < cm->mi_cols) && (mi_row + ms >= cm->mi_rows))
+    return 1;
+  // skip vertical/none partition types
+  if ((mi_row + ms < cm->mi_rows) && (mi_col + ms >= cm->mi_cols))
+    return 2;
+
+  return -1;
+}
+
+static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
+                       int mi_row, int bh,
+                       int mi_col, int bw) {
+  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) << 3);
+  xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) << 3;
+  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) << 3);
+  xd->mb_to_right_edge  = ((cm->mi_cols - bw - mi_col) * MI_SIZE) << 3;
+
   // Are edges available for intra prediction?
-  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);
-  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);
+  xd->up_available    = (mi_row != 0);
+  xd->left_available  = (mi_col > cm->cur_tile_mi_col_start);
+  xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
 }
 
-static int get_mb_row(const MACROBLOCKD *xd) {
-  return ((-xd->mb_to_top_edge) >> 7);
+static int get_mi_row(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE));
 }
 
-static int get_mb_col(const MACROBLOCKD *xd) {
-  return ((-xd->mb_to_left_edge) >> 7);
+static int get_mi_col(const MACROBLOCKD *xd) {
+  return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE));
+}
+
+static int get_token_alloc(int mb_rows, int mb_cols) {
+  return mb_rows * mb_cols * (48 * 16 + 4);
+}
+
+static void set_prev_mi(VP9_COMMON *cm) {
+  const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
+                                       cm->height == cm->last_height &&
+                                       !cm->error_resilient_mode &&
+                                       !cm->intra_only &&
+                                       cm->last_show_frame;
+  // Special case: set prev_mi to NULL when the previous mode info
+  // context cannot be used.
+  cm->prev_mi = use_prev_in_find_mv_refs ?
+                  cm->prev_mip + cm->mode_info_stride + 1 : NULL;
 }
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -53,7 +53,7 @@
   { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
 };
 
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
+static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = {
   { RGB_TO_YUV(0x6633ff) },   /* Purple */
   { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
   { RGB_TO_YUV(0xff33cc) },   /* Pink */
@@ -132,7 +132,7 @@
 
 /****************************************************************************
  */
-void vp9_post_proc_down_and_across_c(uint8_t *src_ptr,
+void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
                                      int dst_pixels_per_line,
@@ -139,7 +139,8 @@
                                      int rows,
                                      int cols,
                                      int flimit) {
-  uint8_t *p_src, *p_dst;
+  uint8_t const *p_src;
+  uint8_t *p_dst;
   int row;
   int col;
   int i;
@@ -313,51 +314,64 @@
                                 source->uv_height, source->uv_width, ppl);
 }
 
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int i;
 
-  vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
-                                source->y_stride, post->y_stride,
-                                source->y_height, source->y_width, ppl);
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width,
+                             src->alpha_width};
+  const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height,
+                              src->alpha_height};
 
-  vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
-                                source->uv_stride, post->uv_stride,
-                                source->uv_height, source->uv_width, ppl);
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
 
-  vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
-                                source->uv_stride, post->uv_stride,
-                                source->uv_height, source->uv_width, ppl);
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_post_proc_down_and_across(srcs[i], dsts[i],
+                                  src_strides[i], dst_strides[i],
+                                  src_heights[i], src_widths[i], ppl);
 }
 
-void vp9_denoise(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) post;
-  (void) low_var_thresh;
-  (void) flag;
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int i;
 
-  vp9_post_proc_down_and_across(src->y_buffer + 2 * src->y_stride + 2,
-                                src->y_buffer + 2 * src->y_stride + 2,
-                                src->y_stride, src->y_stride, src->y_height - 4,
-                                src->y_width - 4, ppl);
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width,
+                             src->alpha_width};
+  const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height,
+                              src->alpha_height};
 
-  vp9_post_proc_down_and_across(src->u_buffer + 2 * src->uv_stride + 2,
-                                src->u_buffer + 2 * src->uv_stride + 2,
-                                src->uv_stride, src->uv_stride,
-                                src->uv_height - 4, src->uv_width - 4, ppl);
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
 
-  vp9_post_proc_down_and_across(src->v_buffer + 2 * src->uv_stride + 2,
-                                src->v_buffer + 2 * src->uv_stride + 2,
-                                src->uv_stride, src->uv_stride,
-                                src->uv_height - 4, src->uv_width - 4, ppl);
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    const int src_stride = src_strides[i];
+    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+    const int src_width = src_widths[i] - 4;
+    const int src_height = src_heights[i] - 4;
+
+    const int dst_stride = dst_strides[i];
+    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+    vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                  src_height, src_width, ppl);
+  }
 }
 
 double vp9_gaussian(double sigma, double mu, double x) {
@@ -631,13 +645,7 @@
 
   if (!flags) {
     *dest = *oci->frame_to_show;
-
-    /* handle problem with extending borders */
-    dest->y_width = oci->width;
-    dest->y_height = oci->height;
-    dest->uv_height = dest->y_height / 2;
     return 0;
-
   }
 
 #if ARCH_X86||ARCH_X86_64
@@ -648,7 +656,7 @@
     deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
                                q + (deblock_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0);
+    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
   } else {
     vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
   }
@@ -727,7 +735,7 @@
     for (i = 0; i < mb_rows; i++) {
       for (j = 0; j < mb_cols; j++) {
         char zz[4];
-        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+        int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
                         mi[mb_index].mbmi.mode != SPLITMV &&
                         mi[mb_index].mbmi.mb_skip_coeff);
 
@@ -913,8 +921,8 @@
       for (x = 0; x < width; x += 16) {
         int Y = 0, U = 0, V = 0;
 
-        if (mi->mbmi.mode == B_PRED &&
-            ((ppflags->display_mb_modes_flag & B_PRED) ||
+        if (mi->mbmi.mode == I4X4_PRED &&
+            ((ppflags->display_mb_modes_flag & I4X4_PRED) ||
              ppflags->display_b_modes_flag)) {
           int by, bx;
           uint8_t *yl, *ul, *vl;
@@ -927,7 +935,7 @@
           for (by = 0; by < 16; by += 4) {
             for (bx = 0; bx < 16; bx += 4) {
               if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
-                  || (ppflags->display_mb_modes_flag & B_PRED)) {
+                  || (ppflags->display_mb_modes_flag & I4X4_PRED)) {
                 Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0];
                 U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1];
                 V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2];
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -29,10 +29,8 @@
 int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
                         vp9_ppflags_t *flags);
 
-void vp9_denoise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag);
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
-void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag);
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
 #endif  // VP9_COMMON_VP9_POSTPROC_H_
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -9,6 +9,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
@@ -21,8 +23,11 @@
                                    const MACROBLOCKD *const xd,
                                    PRED_ID pred_id) {
   int pred_context;
-  MODE_INFO *m = xd->mode_info_context;
-
+  const MODE_INFO *const mi = xd->mode_info_context;
+  const MODE_INFO *const above_mi = mi - cm->mode_info_stride;
+  const MODE_INFO *const left_mi = mi - 1;
+  const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image;
+  const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries correpsonding to real macroblocks.
@@ -29,77 +34,352 @@
   // The prediction flags in these dummy entries are initialised to 0.
   switch (pred_id) {
     case PRED_SEG_ID:
-      pred_context = (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      pred_context = above_mi->mbmi.seg_id_predicted;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.seg_id_predicted;
+        pred_context += left_mi->mbmi.seg_id_predicted;
       break;
 
-    case PRED_REF:
-      pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted;
+    case PRED_MBSKIP:
+      pred_context = above_mi->mbmi.mb_skip_coeff;
       if (xd->left_available)
-        pred_context += (m - 1)->mbmi.ref_predicted;
+        pred_context += left_mi->mbmi.mb_skip_coeff;
       break;
 
-    case PRED_COMP:
-      // Context based on use of comp pred flag by neighbours
-      // pred_context =
-      //   ((m - 1)->mbmi.second_ref_frame > INTRA_FRAME) +
-      //    ((m - cm->mode_info_stride)->mbmi.second_ref_frame > INTRA_FRAME);
+    case PRED_SWITCHABLE_INTERP: {
+      // left
+      const int left_mv_pred = is_inter_mode(left_mi->mbmi.mode);
+      const int left_interp = left_in_image && left_mv_pred ?
+                    vp9_switchable_interp_map[left_mi->mbmi.interp_filter] :
+                    VP9_SWITCHABLE_FILTERS;
 
-      // Context based on mode and reference frame
-      // if ( m->mbmi.ref_frame == LAST_FRAME )
-      //    pred_context = 0 + (m->mbmi.mode != ZEROMV);
-      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
-      //    pred_context = 2 + (m->mbmi.mode != ZEROMV);
-      // else
-      //    pred_context = 4 + (m->mbmi.mode != ZEROMV);
+      // above
+      const int above_mv_pred = is_inter_mode(above_mi->mbmi.mode);
+      const int above_interp = above_in_image && above_mv_pred ?
+                    vp9_switchable_interp_map[above_mi->mbmi.interp_filter] :
+                    VP9_SWITCHABLE_FILTERS;
 
-      if (m->mbmi.ref_frame == LAST_FRAME)
-        pred_context = 0;
+      assert(left_interp != -1);
+      assert(above_interp != -1);
+
+      if (left_interp == above_interp)
+        pred_context = left_interp;
+      else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+               above_interp != VP9_SWITCHABLE_FILTERS)
+         pred_context = above_interp;
+      else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+               above_interp == VP9_SWITCHABLE_FILTERS)
+        pred_context = left_interp;
       else
+        pred_context = VP9_SWITCHABLE_FILTERS;
+
+      break;
+    }
+
+    case PRED_INTRA_INTER: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (left_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            above_mi->mbmi.ref_frame[0] == INTRA_FRAME) {  // intra/intra (3)
+          pred_context = 3;
+        } else {  // intra/inter (1) or inter/inter (0)
+          pred_context = left_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                         above_mi->mbmi.ref_frame[0] == INTRA_FRAME;
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        // inter: 0, intra: 2
+        pred_context = 2 * (edge->mbmi.ref_frame[0] == INTRA_FRAME);
+      } else {
+        pred_context = 0;
+      }
+      assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS);
+      break;
+    }
+
+    case PRED_COMP_INTER_INTER: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // neither edge uses comp pred (0/1)
+          pred_context = ((above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref) ^
+                          (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref));
+        } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // one of two edges uses comp pred (2/3)
+          pred_context = 2 +
+              (above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref ||
+               above_mi->mbmi.ref_frame[0] == INTRA_FRAME);
+        } else if (left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // one of two edges uses comp pred (2/3)
+          pred_context = 2 +
+              (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref ||
+               left_mi->mbmi.ref_frame[0] == INTRA_FRAME);
+        } else {  // both edges use comp pred (4)
+          pred_context = 4;
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          // edge does not use comp pred (0/1)
+          pred_context = edge->mbmi.ref_frame[0] == cm->comp_fixed_ref;
+        } else {  // edge uses comp pred (3)
+          pred_context = 3;
+        }
+      } else {  // no edges available (1)
         pred_context = 1;
+      }
+      assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS);
+      break;
+    }
 
+    case PRED_COMP_REF_P: {
+      const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      const int var_ref_idx = !fix_ref_idx;
+
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {  // intra/intra (2)
+          pred_context = 2;
+        } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                   left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {  // intra/inter
+          const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
+                                  left_mi : above_mi;
+
+          if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {  // single pred (1/3)
+            pred_context = 1 +
+                2 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1];
+          } else {  // comp pred (1/3)
+            pred_context = 1 +
+                2 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1];
+          }
+        } else {  // inter/inter
+          int l_sg = left_mi->mbmi.ref_frame[1] <= INTRA_FRAME;
+          int a_sg = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME;
+          MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->mbmi.ref_frame[0] :
+              above_mi->mbmi.ref_frame[var_ref_idx];
+          MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->mbmi.ref_frame[0] :
+              left_mi->mbmi.ref_frame[var_ref_idx];
+
+          if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) {
+            pred_context = 0;
+          } else if (l_sg && a_sg) {  // single/single
+            if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) ||
+                (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) {
+              pred_context = 4;
+            } else if (vrfa == vrfl) {
+              pred_context = 3;
+            } else {
+              pred_context = 1;
+            }
+          } else if (l_sg || a_sg) {  // single/comp
+            MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl;
+            MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl;
+
+            if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) {
+              pred_context = 1;
+            } else if (rfs == cm->comp_var_ref[1] &&
+                       vrfc != cm->comp_var_ref[1]) {
+              pred_context = 2;
+            } else {
+              pred_context = 4;
+            }
+          } else if (vrfa == vrfl) {  // comp/comp
+            pred_context = 4;
+          } else {
+            pred_context = 2;
+          }
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (edge->mbmi.ref_frame[1] > INTRA_FRAME) {
+          pred_context =
+              4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1];
+        } else {
+          pred_context = 3 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1];
+        }
+      } else {  // no edges available (2)
+        pred_context = 2;
+      }
+      assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
       break;
+    }
 
-    case PRED_MBSKIP:
-      pred_context = (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
-      if (xd->left_available)
-        pred_context += (m - 1)->mbmi.mb_skip_coeff;
+    case PRED_SINGLE_REF_P1: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                   left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
+                                  left_mi : above_mi;
+
+          if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+            pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME);
+          } else {
+            pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME ||
+                                edge->mbmi.ref_frame[1] == LAST_FRAME);
+          }
+        } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          pred_context = 2 * (above_mi->mbmi.ref_frame[0] == LAST_FRAME) +
+                         2 * (left_mi->mbmi.ref_frame[0] == LAST_FRAME);
+        } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] > INTRA_FRAME) {
+          pred_context = 1 + (above_mi->mbmi.ref_frame[0] == LAST_FRAME ||
+                              above_mi->mbmi.ref_frame[1] == LAST_FRAME ||
+                              left_mi->mbmi.ref_frame[0] == LAST_FRAME ||
+                              left_mi->mbmi.ref_frame[1] == LAST_FRAME);
+        } else {
+          MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1];
+
+          if (rfs == LAST_FRAME) {
+            pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+          } else {
+            pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
+          }
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME);
+        } else {
+          pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME ||
+                              edge->mbmi.ref_frame[1] == LAST_FRAME);
+        }
+      } else {  // no edges available (2)
+        pred_context = 2;
+      }
+      assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
       break;
+    }
 
-    case PRED_SWITCHABLE_INTERP:
-      {
-        int left_in_image = xd->left_available && (m - 1)->mbmi.mb_in_image;
-        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-        int left_mode = (m - 1)->mbmi.mode;
-        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
-        int left_interp, above_interp;
-        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
-          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
-        else
-          left_interp = VP9_SWITCHABLE_FILTERS;
-        assert(left_interp != -1);
-        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
-          above_interp = vp9_switchable_interp_map[
-              (m - cm->mode_info_stride)->mbmi.interp_filter];
-        else
-          above_interp = VP9_SWITCHABLE_FILTERS;
-        assert(above_interp != -1);
+    case PRED_SINGLE_REF_P2: {
+      if (above_in_image && left_in_image) {  // both edges available
+        if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
+            left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          pred_context = 2;
+        } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME ||
+                   left_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
+          const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ?
+                                  left_mi : above_mi;
 
-        if (left_interp == above_interp)
-          pred_context = left_interp;
-        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-                 above_interp != VP9_SWITCHABLE_FILTERS)
-          pred_context = above_interp;
-        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-                 above_interp == VP9_SWITCHABLE_FILTERS)
-          pred_context = left_interp;
-        else
-          pred_context = VP9_SWITCHABLE_FILTERS;
+          if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+            if (edge->mbmi.ref_frame[0] == LAST_FRAME) {
+              pred_context = 3;
+            } else {
+              pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
+            }
+          } else {
+            pred_context = 1 + 2 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                                    edge->mbmi.ref_frame[1] == GOLDEN_FRAME);
+          }
+        } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          if (above_mi->mbmi.ref_frame[0] == LAST_FRAME &&
+              left_mi->mbmi.ref_frame[0] == LAST_FRAME) {
+            pred_context = 3;
+          } else if (above_mi->mbmi.ref_frame[0] == LAST_FRAME ||
+                     left_mi->mbmi.ref_frame[0] == LAST_FRAME) {
+            const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == LAST_FRAME ?
+                                    left_mi : above_mi;
+
+            pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
+          } else {
+            pred_context = 2 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME) +
+                           2 * (left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME);
+          }
+        } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME &&
+                   left_mi->mbmi.ref_frame[1] > INTRA_FRAME) {
+          if (above_mi->mbmi.ref_frame[0] == left_mi->mbmi.ref_frame[0] &&
+              above_mi->mbmi.ref_frame[1] == left_mi->mbmi.ref_frame[1]) {
+            pred_context = 3 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                                above_mi->mbmi.ref_frame[1] == GOLDEN_FRAME ||
+                                left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                                left_mi->mbmi.ref_frame[1] == GOLDEN_FRAME);
+          } else {
+            pred_context = 2;
+          }
+        } else {
+          MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0];
+          MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ?
+              above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1];
+
+          if (rfs == GOLDEN_FRAME) {
+            pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+          } else if (rfs == ALTREF_FRAME) {
+            pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+          } else {
+            pred_context =
+                1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+          }
+        }
+      } else if (above_in_image || left_in_image) {  // one edge available
+        const MODE_INFO *edge = above_in_image ? above_mi : left_mi;
+
+        if (edge->mbmi.ref_frame[0] == INTRA_FRAME ||
+            (edge->mbmi.ref_frame[0] == LAST_FRAME &&
+             edge->mbmi.ref_frame[1] <= INTRA_FRAME)) {
+          pred_context = 2;
+        } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) {
+          pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME);
+        } else {
+          pred_context = 3 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME ||
+                              edge->mbmi.ref_frame[1] == GOLDEN_FRAME);
+        }
+      } else {  // no edges available (2)
+        pred_context = 2;
       }
+      assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
       break;
+    }
 
+    case PRED_TX_SIZE: {
+      int above_context, left_context;
+      int max_tx_size;
+      if (mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+        max_tx_size = TX_4X4;
+      else if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+        max_tx_size = TX_8X8;
+      else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
+        max_tx_size = TX_16X16;
+      else
+        max_tx_size = TX_32X32;
+      above_context = left_context = max_tx_size;
+      if (above_in_image) {
+        above_context = (above_mi->mbmi.mb_skip_coeff ?
+                         max_tx_size : above_mi->mbmi.txfm_size);
+      }
+      if (left_in_image) {
+        left_context = (left_mi->mbmi.mb_skip_coeff ?
+                        max_tx_size : left_mi->mbmi.txfm_size);
+      }
+      if (!left_in_image) {
+        left_context = above_context;
+      }
+      if (!above_in_image) {
+        above_context = left_context;
+      }
+      pred_context = (above_context + left_context > max_tx_size);
+      break;
+    }
+
     default:
+      assert(0);
       pred_context = 0;  // *** add error trap code.
       break;
   }
@@ -117,16 +397,20 @@
   switch (pred_id) {
     case PRED_SEG_ID:
       return cm->segment_pred_probs[pred_context];
-    case PRED_REF:
-      return cm->ref_pred_probs[pred_context];
-    case PRED_COMP:
-      // In keeping with convention elsewhre the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      return cm->prob_comppred[pred_context];
     case PRED_MBSKIP:
-      return cm->mbskip_pred_probs[pred_context];
+      return cm->fc.mbskip_probs[pred_context];
+    case PRED_INTRA_INTER:
+      return cm->fc.intra_inter_prob[pred_context];
+    case PRED_COMP_INTER_INTER:
+      return cm->fc.comp_inter_prob[pred_context];
+    case PRED_COMP_REF_P:
+      return cm->fc.comp_ref_prob[pred_context];
+    case PRED_SINGLE_REF_P1:
+      return cm->fc.single_ref_prob[pred_context][0];
+    case PRED_SINGLE_REF_P2:
+      return cm->fc.single_ref_prob[pred_context][1];
     default:
+      assert(0);
       return 128;  // *** add error trap code.
   }
 }
@@ -136,23 +420,23 @@
 const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
                                    const MACROBLOCKD *const xd,
                                    PRED_ID pred_id) {
+  const MODE_INFO *const mi = xd->mode_info_context;
   const int pred_context = vp9_get_pred_context(cm, xd, pred_id);
 
   switch (pred_id) {
-    case PRED_SEG_ID:
-      return &cm->segment_pred_probs[pred_context];
-    case PRED_REF:
-      return &cm->ref_pred_probs[pred_context];
-    case PRED_COMP:
-      // In keeping with convention elsewhre the probability returned is
-      // the probability of a "0" outcome which in this case means the
-      // probability of comp pred off.
-      return &cm->prob_comppred[pred_context];
-    case PRED_MBSKIP:
-      return &cm->mbskip_pred_probs[pred_context];
     case PRED_SWITCHABLE_INTERP:
       return &cm->fc.switchable_interp_prob[pred_context][0];
+
+    case PRED_TX_SIZE:
+      if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
+        return cm->fc.tx_probs_8x8p[pred_context];
+      else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32)
+        return cm->fc.tx_probs_16x16p[pred_context];
+      else
+        return cm->fc.tx_probs_32x32p[pred_context];
+
     default:
+      assert(0);
       return NULL;  // *** add error trap code.
   }
 }
@@ -164,11 +448,10 @@
   switch (pred_id) {
     case PRED_SEG_ID:
       return xd->mode_info_context->mbmi.seg_id_predicted;
-    case PRED_REF:
-      return  xd->mode_info_context->mbmi.ref_predicted;
     case PRED_MBSKIP:
       return xd->mode_info_context->mbmi.mb_skip_coeff;
     default:
+      assert(0);
       return 0;  // *** add error trap code.
   }
 }
@@ -179,59 +462,34 @@
                        PRED_ID pred_id,
                        unsigned char pred_flag) {
   const int mis = xd->mode_info_stride;
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  const int bh = 1 << mi_height_log2(bsize);
+  const int bw = 1 << mi_width_log2(bsize);
+#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
+  const int x_mis = sub(bw, xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE));
+  const int y_mis = sub(bh, xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE));
+#undef sub
+  int x, y;
 
   switch (pred_id) {
     case PRED_SEG_ID:
-      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-      if (xd->mode_info_context->mbmi.sb_type) {
-#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
-        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
-        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
-        int x, y;
-
-        for (y = 0; y < y_mbs; y++) {
-          for (x = !y; x < x_mbs; x++) {
-            xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted =
-                pred_flag;
-          }
+      for (y = 0; y < y_mis; y++) {
+        for (x = 0; x < x_mis; x++) {
+          xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag;
         }
       }
       break;
 
-    case PRED_REF:
-      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-      if (xd->mode_info_context->mbmi.sb_type) {
-        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
-        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
-        int x, y;
-
-        for (y = 0; y < y_mbs; y++) {
-          for (x = !y; x < x_mbs; x++) {
-            xd->mode_info_context[y * mis + x].mbmi.ref_predicted = pred_flag;
-          }
-        }
-      }
-      break;
-
     case PRED_MBSKIP:
-      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-      if (xd->mode_info_context->mbmi.sb_type) {
-        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
-        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
-        int x, y;
-
-        for (y = 0; y < y_mbs; y++) {
-          for (x = !y; x < x_mbs; x++) {
-            xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
-          }
+      for (y = 0; y < y_mis; y++) {
+        for (x = 0; x < x_mis; x++) {
+          xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
         }
       }
       break;
 
     default:
+      assert(0);
       // *** add error trap code.
       break;
   }
@@ -242,162 +500,21 @@
 // peredict various bitstream signals.
 
 // Macroblock segment id prediction function
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd, int MbIndex) {
-  // Currently the prediction for the macroblock segment ID is
-  // the value stored for this macroblock in the previous frame.
-  if (!xd->mode_info_context->mbmi.sb_type) {
-    return cm->last_frame_seg_map[MbIndex];
-  } else {
-    const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
-    const int mb_col = MbIndex % cm->mb_cols;
-    const int mb_row = MbIndex / cm->mb_cols;
-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
-    int x, y;
-    unsigned seg_id = -1;
+int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
+                          int mi_row, int mi_col) {
+  const int mi_index = mi_row * cm->mi_cols + mi_col;
+  const int bw = 1 << mi_width_log2(sb_type);
+  const int bh = 1 << mi_height_log2(sb_type);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  int segment_id = INT_MAX;
+  int x, y;
 
-    for (y = mb_row; y < mb_row + y_mbs; y++) {
-      for (x = mb_col; x < mb_col + x_mbs; x++) {
-        seg_id = MIN(seg_id, cm->last_frame_seg_map[cm->mb_cols * y + x]);
-      }
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      const int index = mi_index + (y * cm->mi_cols + x);
+      segment_id = MIN(segment_id, cm->last_frame_seg_map[index]);
     }
-
-    return seg_id;
   }
-}
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd) {
-  MODE_INFO *m = xd->mode_info_context;
-
-  MV_REFERENCE_FRAME left;
-  MV_REFERENCE_FRAME above;
-  MV_REFERENCE_FRAME above_left;
-  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
-
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int i;
-
-  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
-  unsigned char ref_score[MAX_REF_FRAMES];
-  unsigned char best_score = 0;
-  unsigned char left_in_image;
-  unsigned char above_in_image;
-  unsigned char above_left_in_image;
-
-  // Is segment coding ennabled
-  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
-  // Special case treatment if segment coding is enabled.
-  // Dont allow prediction of a reference frame that the segment
-  // does not allow
-  if (seg_ref_active) {
-    for (i = 0; i < MAX_REF_FRAMES; i++) {
-      frame_allowed[i] =
-        vp9_check_segref(xd, segment_id, i);
-
-      // Score set to 0 if ref frame not allowed
-      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
-    }
-  } else
-    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
-
-  // Reference frames used by neighbours
-  left = (m - 1)->mbmi.ref_frame;
-  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
-  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
-
-  // Are neighbours in image
-  left_in_image = (m - 1)->mbmi.mb_in_image && xd->left_available;
-  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
-  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image &&
-                        xd->left_available;
-
-  // Adjust scores for candidate reference frames based on neigbours
-  if (frame_allowed[left] && left_in_image) {
-    ref_score[left] += 16;
-    if (above_left_in_image && (left == above_left))
-      ref_score[left] += 4;
-  }
-  if (frame_allowed[above] && above_in_image) {
-    ref_score[above] += 16;
-    if (above_left_in_image && (above == above_left))
-      ref_score[above] += 4;
-  }
-
-  // Now choose the candidate with the highest score
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (ref_score[i] > best_score) {
-      pred_ref = i;
-      best_score = ref_score[i];
-    }
-  }
-
-  return pred_ref;
-}
-
-// Functions to computes a set of modified reference frame probabilities
-// to use when the prediction of the reference frame value fails
-void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
-  int tot_count = count[0] + count[1] + count[2] + count[3];
-  probs[0] = get_prob(count[0], tot_count);
-
-  tot_count -= count[0];
-  probs[1] = get_prob(count[1], tot_count);
-
-  tot_count -= count[1];
-  probs[2] = get_prob(count[2], tot_count);
-}
-
-// Computes a set of modified conditional probabilities for the reference frame
-// Values willbe set to 0 for reference frame options that are not possible
-// because wither they were predicted and prediction has failed or because
-// they are not allowed for a given segment.
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
-  int norm_cnt[MAX_REF_FRAMES];
-  const int intra_count = cm->prob_intra_coded;
-  const int inter_count = (255 - intra_count);
-  const int last_count = (inter_count * cm->prob_last_coded) / 255;
-  const int gfarf_count = inter_count - last_count;
-  const int gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
-  const int arf_count = gfarf_count - gf_count;
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  norm_cnt[0] = 0;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
-  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = 0;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
-  cm->mod_refprobs[LAST_FRAME][1] = 0;    // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = 0;
-  norm_cnt[3] = arf_count;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
-  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;  // This branch implicit
-
-  norm_cnt[0] = intra_count;
-  norm_cnt[1] = last_count;
-  norm_cnt[2] = gf_count;
-  norm_cnt[3] = 0;
-  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
-  cm->mod_refprobs[ALTREF_FRAME][2] = 0;  // This branch implicit
-
-  // Score the reference frames based on overal frequency.
-  // These scores contribute to the prediction choices.
-  // Max score 17 min 1
-  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
-  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
-  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
-  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
+  return segment_id;
 }
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -17,10 +17,14 @@
 // Predicted items
 typedef enum {
   PRED_SEG_ID = 0,  // Segment identifier
-  PRED_REF = 1,
-  PRED_COMP = 2,
-  PRED_MBSKIP = 3,
-  PRED_SWITCHABLE_INTERP = 4
+  PRED_MBSKIP = 1,
+  PRED_SWITCHABLE_INTERP = 2,
+  PRED_INTRA_INTER = 3,
+  PRED_COMP_INTER_INTER = 4,
+  PRED_SINGLE_REF_P1 = 5,
+  PRED_SINGLE_REF_P2 = 6,
+  PRED_COMP_REF_P = 7,
+  PRED_TX_SIZE = 8
 } PRED_ID;
 
 unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
@@ -43,13 +47,7 @@
                        unsigned char pred_flag);
 
 
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd,
-                                    int MbIndex);
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
-                                    const MACROBLOCKD *const xd);
-
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
+int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
+                          int mi_row, int mi_col);
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
--- a/vp9/common/vp9_quant_common.c
+++ b/vp9/common/vp9_quant_common.c
@@ -10,46 +10,60 @@
 
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
 
-static int dc_qlookup[QINDEX_RANGE];
-static int ac_qlookup[QINDEX_RANGE];
+static int16_t dc_qlookup[QINDEX_RANGE];
+static int16_t ac_qlookup[QINDEX_RANGE];
 
-#define ACDC_MIN 4
+#define ACDC_MIN 8
 
+// TODO(dkovalev) move to common and reuse
+static double poly3(double a, double b, double c, double d, double x) {
+  return a*x*x*x + b*x*x + c*x + d;
+}
+
 void vp9_init_quant_tables() {
-  int i;
-  int current_val = 4;
-  int last_val = 4;
-  int ac_val;
+  int i, val = 4;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    ac_qlookup[i] = current_val;
-    current_val = (int)(current_val * 1.02);
-    if (current_val == last_val)
-      current_val++;
-    last_val = current_val;
+  // A "real" q of 1.0 forces lossless mode.
+  // In practice non lossless Q's between 1.0 and 2.0 (represented here by
+  // integer values from 5-7 give poor rd results (lower psnr and often
+  // larger size than the lossless encode. To block out those "not very useful"
+  // values we increment the ac and dc q lookup values by 4 after position 0.
+  ac_qlookup[0] = val;
+  dc_qlookup[0] = val;
+  val += 4;
 
-    ac_val = ac_qlookup[i];
-    dc_qlookup[i] = (int)((0.000000305 * ac_val * ac_val * ac_val) +
-                          (-0.00065 * ac_val * ac_val) +
-                          (0.9 * ac_val) + 0.5);
-    if (dc_qlookup[i] < ACDC_MIN)
-      dc_qlookup[i] = ACDC_MIN;
+  for (i = 1; i < QINDEX_RANGE; i++) {
+    const int ac_val = val;
+
+    val = (int)(val * 1.01975);
+    if (val == ac_val)
+      ++val;
+
+    ac_qlookup[i] = (int16_t)ac_val;
+    dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9,
+                                                 0.5, ac_val));
   }
 }
 
-int vp9_dc_quant(int qindex, int delta) {
+int16_t vp9_dc_quant(int qindex, int delta) {
   return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
 
-int vp9_dc_uv_quant(int qindex, int delta) {
-  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];
+int16_t vp9_ac_quant(int qindex, int delta) {
+  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
 }
 
-int vp9_ac_yquant(int qindex) {
-  return ac_qlookup[clamp(qindex, 0, MAXQ)];
-}
 
-int vp9_ac_uv_quant(int qindex, int delta) {
-  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];
+int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
+    return xd->mb_segment_abs_delta == SEGMENT_ABSDATA ?
+               data :  // Abs value
+               clamp(base_qindex + data, 0, MAXQ);  // Delta value
+  } else {
+    return base_qindex;
+  }
 }
+
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -12,14 +12,17 @@
 #define VP9_COMMON_VP9_QUANT_COMMON_H_
 
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_onyxc_int.h"
 
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+
 void vp9_init_quant_tables();
-int vp9_ac_yquant(int qindex);
-int vp9_dc_quant(int qindex, int delta);
-int vp9_dc2quant(int qindex, int delta);
-int vp9_ac2quant(int qindex, int delta);
-int vp9_dc_uv_quant(int qindex, int delta);
-int vp9_ac_uv_quant(int qindex, int delta);
+
+int16_t vp9_dc_quant(int qindex, int delta);
+int16_t vp9_ac_quant(int qindex, int delta);
+
+int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);
 
 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
--- a/vp9/common/vp9_recon.c
+++ /dev/null
@@ -1,202 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_blockd.h"
-
-void vp9_recon_b_c(uint8_t *pred_ptr,
-                   int16_t *diff_ptr,
-                   uint8_t *dst_ptr,
-                   int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon_uv_b_c(uint8_t *pred_ptr,
-                      int16_t *diff_ptr,
-                      uint8_t *dst_ptr,
-                      int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-
-void vp9_recon4b_c(uint8_t *pred_ptr,
-                   int16_t *diff_ptr,
-                   uint8_t *dst_ptr,
-                   int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 16; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 16;
-    pred_ptr += 16;
-  }
-}
-
-void vp9_recon2b_c(uint8_t *pred_ptr,
-                   int16_t *diff_ptr,
-                   uint8_t *dst_ptr,
-                   int stride) {
-  int r, c;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 8; c++) {
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
-    }
-
-    dst_ptr += stride;
-    diff_ptr += 8;
-    pred_ptr += 8;
-  }
-}
-
-void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y;
-  BLOCKD *b = &xd->block[0];
-  int stride = b->dst_stride;
-  int16_t *diff = b->diff;
-
-  for (y = 0; y < 16; y++) {
-    for (x = 0; x < 16; x++) {
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-    }
-    dst += stride;
-    diff += 16;
-  }
-}
-
-void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, i;
-  uint8_t *dst = udst;
-
-  for (i = 0; i < 2; i++, dst = vdst) {
-    BLOCKD *b = &xd->block[16 + 4 * i];
-    int stride = b->dst_stride;
-    int16_t *diff = b->diff;
-
-    for (y = 0; y < 8; y++) {
-      for (x = 0; x < 8; x++) {
-        dst[x] = clip_pixel(dst[x] + diff[x]);
-      }
-      dst += stride;
-      diff += 8;
-    }
-  }
-}
-
-void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y, stride = xd->block[0].dst_stride;
-  int16_t *diff = xd->diff;
-
-  for (y = 0; y < 32; y++) {
-    for (x = 0; x < 32; x++) {
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-    }
-    dst += stride;
-    diff += 32;
-  }
-}
-
-void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, stride = xd->block[16].dst_stride;
-  int16_t *udiff = xd->diff + 1024;
-  int16_t *vdiff = xd->diff + 1280;
-
-  for (y = 0; y < 16; y++) {
-    for (x = 0; x < 16; x++) {
-      udst[x] = clip_pixel(udst[x] + udiff[x]);
-      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);
-    }
-    udst += stride;
-    vdst += stride;
-    udiff += 16;
-    vdiff += 16;
-  }
-}
-
-void vp9_recon_sb64y_s_c(MACROBLOCKD *xd, uint8_t *dst) {
-  int x, y, stride = xd->block[0].dst_stride;
-  int16_t *diff = xd->diff;
-
-  for (y = 0; y < 64; y++) {
-    for (x = 0; x < 64; x++) {
-      dst[x] = clip_pixel(dst[x] + diff[x]);
-    }
-    dst += stride;
-    diff += 64;
-  }
-}
-
-void vp9_recon_sb64uv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
-  int x, y, stride = xd->block[16].dst_stride;
-  int16_t *udiff = xd->diff + 4096;
-  int16_t *vdiff = xd->diff + 4096 + 1024;
-
-  for (y = 0; y < 32; y++) {
-    for (x = 0; x < 32; x++) {
-      udst[x] = clip_pixel(udst[x] + udiff[x]);
-      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);
-    }
-    udst += stride;
-    vdst += stride;
-    udiff += 32;
-    vdiff += 32;
-  }
-}
-
-void vp9_recon_mby_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-void vp9_recon_mb_c(MACROBLOCKD *xd) {
-  int i;
-
-  for (i = 0; i < 16; i += 4) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -17,22 +17,110 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+static int scale_value_x_with_scaling(int val,
+                                      const struct scale_factors *scale) {
+  return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
+}
+
+static int scale_value_y_with_scaling(int val,
+                                      const struct scale_factors *scale) {
+  return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
+}
+
+static int unscaled_value(int val, const struct scale_factors *scale) {
+  (void) scale;
+  return val;
+}
+
+static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv,
+                                         const struct scale_factors *scale) {
+  // returns mv * scale + offset
+  int_mv32 result;
+  const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
+  const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
+
+  result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
+                      + scale->y_offset_q4;
+  result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
+                      + scale->x_offset_q4;
+  return result;
+}
+
+static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv,
+                                            const struct scale_factors *scale) {
+  // returns mv * scale + offset
+  int_mv32 result;
+
+  result.as_mv.row = src_mv->as_mv.row << 1;
+  result.as_mv.col = src_mv->as_mv.col << 1;
+  return result;
+}
+
+static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp,
+                                            int offset_q4) {
+  int32_t scaled_mv;
+  // returns the scaled and offset value of the mv component.
+  scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4;
+
+  return scaled_mv;
+}
+
+static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp,
+                                               int offset_q4) {
+  // returns the scaled and offset value of the mv component.
+  (void)scale_fp;
+  (void)offset_q4;
+  return mv_q4;
+}
+
+static void set_offsets_with_scaling(struct scale_factors *scale,
+                                     int row, int col) {
+  const int x_q4 = 16 * col;
+  const int y_q4 = 16 * row;
+
+  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
+  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
+}
+
+static void set_offsets_without_scaling(struct scale_factors *scale,
+                                        int row, int col) {
+  scale->x_offset_q4 = 0;
+  scale->y_offset_q4 = 0;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  // Calculate scaling factor once for each reference frame
+  // and use fixed point scaling factors in decoding and encoding routines.
+  // Hardware implementations can calculate scale factor in device driver
+  // and use multiplication and shifting on hardware instead of division.
+  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
+}
+
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       YV12_BUFFER_CONFIG *other,
+                                       int other_w, int other_h,
                                        int this_w, int this_h) {
-  int other_h = other->y_crop_height;
-  int other_w = other->y_crop_width;
-
-  scale->x_num = other_w;
-  scale->x_den = this_w;
+  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
   scale->x_offset_q4 = 0;  // calculated per-mb
-  scale->x_step_q4 = 16 * other_w / this_w;
+  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
 
-  scale->y_num = other_h;
-  scale->y_den = this_h;
+  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
   scale->y_offset_q4 = 0;  // calculated per-mb
-  scale->y_step_q4 = 16 * other_h / this_h;
+  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
 
+  if ((other_w == this_w) && (other_h == this_h)) {
+    scale->scale_value_x = unscaled_value;
+    scale->scale_value_y = unscaled_value;
+    scale->set_scaled_offsets = set_offsets_without_scaling;
+    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
+    scale->scale_mv_component_q4 = mv_component_q4_without_scaling;
+  } else {
+    scale->scale_value_x = scale_value_x_with_scaling;
+    scale->scale_value_y = scale_value_y_with_scaling;
+    scale->set_scaled_offsets = set_offsets_with_scaling;
+    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
+    scale->scale_mv_component_q4 = mv_component_q4_with_scaling;
+  }
+
   // TODO(agrange): Investigate the best choice of functions to use here
   // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
   // to do at full-pel offsets. The current selection, where the filter is
@@ -39,131 +127,10 @@
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
   if (scale->x_step_q4 == 16) {
     if (scale->y_step_q4 == 16) {
       // No scaling in either direction.
       scale->predict[0][0][0] = vp9_convolve_copy;
-      scale->predict[0][0][1] = vp9_convolve_1by8;
-      scale->predict[0][0][2] = vp9_convolve_qtr;
-      scale->predict[0][0][3] = vp9_convolve_3by8;
-      scale->predict[0][0][4] = vp9_convolve_avg;
-      scale->predict[0][0][5] = vp9_convolve_5by8;
-      scale->predict[0][0][6] = vp9_convolve_3qtr;
-      scale->predict[0][0][7] = vp9_convolve_7by8;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][1][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      scale->predict[0][0][0] = vp9_convolve8_vert;
-      scale->predict[0][0][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][0][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][0][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][0][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][0][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][0][7] = vp9_convolve8_7by8_vert;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;
-      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;
-      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;
-      scale->predict[0][1][4] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;
-      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_1by8;
-      scale->predict[1][0][2] = vp9_convolve8_qtr;
-      scale->predict[1][0][3] = vp9_convolve8_3by8;
-      scale->predict[1][0][4] = vp9_convolve8_avg;
-      scale->predict[1][0][5] = vp9_convolve8_5by8;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr;
-      scale->predict[1][0][7] = vp9_convolve8_7by8;
-    }
-  } else {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      scale->predict[0][0][0] = vp9_convolve8_horiz;
-      scale->predict[0][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[0][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[0][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[0][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[0][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[0][0][7] = vp9_convolve8_7by8_horiz;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_1by8;
-      scale->predict[0][1][2] = vp9_convolve8_qtr;
-      scale->predict[0][1][3] = vp9_convolve8_3by8;
-      scale->predict[0][1][4] = vp9_convolve8_avg;
-      scale->predict[0][1][5] = vp9_convolve8_5by8;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr;
-      scale->predict[0][1][7] = vp9_convolve8_7by8;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;
-      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;
-      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;
-      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;
-      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;
-      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;
-    } else {
-      // Must always scale in both directions.
-      scale->predict[0][0][0] = vp9_convolve8;
-      scale->predict[0][0][1] = vp9_convolve8_1by8;
-      scale->predict[0][0][2] = vp9_convolve8_qtr;
-      scale->predict[0][0][3] = vp9_convolve8_3by8;
-      scale->predict[0][0][4] = vp9_convolve8_avg;
-      scale->predict[0][0][5] = vp9_convolve8_5by8;
-      scale->predict[0][0][6] = vp9_convolve8_3qtr;
-      scale->predict[0][0][7] = vp9_convolve8_7by8;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_1by8;
-      scale->predict[0][1][2] = vp9_convolve8_qtr;
-      scale->predict[0][1][3] = vp9_convolve8_3by8;
-      scale->predict[0][1][4] = vp9_convolve8_avg;
-      scale->predict[0][1][5] = vp9_convolve8_5by8;
-      scale->predict[0][1][6] = vp9_convolve8_3qtr;
-      scale->predict[0][1][7] = vp9_convolve8_7by8;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_1by8;
-      scale->predict[1][0][2] = vp9_convolve8_qtr;
-      scale->predict[1][0][3] = vp9_convolve8_3by8;
-      scale->predict[1][0][4] = vp9_convolve8_avg;
-      scale->predict[1][0][5] = vp9_convolve8_5by8;
-      scale->predict[1][0][6] = vp9_convolve8_3qtr;
-      scale->predict[1][0][7] = vp9_convolve8_7by8;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  scale->predict[1][1][0] = vp9_convolve8;
-  scale->predict[1][1][1] = vp9_convolve8_1by8;
-  scale->predict[1][1][2] = vp9_convolve8_qtr;
-  scale->predict[1][1][3] = vp9_convolve8_3by8;
-  scale->predict[1][1][4] = vp9_convolve8_avg;
-  scale->predict[1][1][5] = vp9_convolve8_5by8;
-  scale->predict[1][1][6] = vp9_convolve8_3qtr;
-  scale->predict[1][1][7] = vp9_convolve8_7by8;
-}
-#else
-  if (scale->x_step_q4 == 16) {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in either direction.
-      scale->predict[0][0][0] = vp9_convolve_copy;
       scale->predict[0][0][1] = vp9_convolve_avg;
       scale->predict[0][1][0] = vp9_convolve8_vert;
       scale->predict[0][1][1] = vp9_convolve8_avg_vert;
@@ -201,35 +168,19 @@
   scale->predict[1][1][0] = vp9_convolve8;
   scale->predict[1][1][1] = vp9_convolve8_avg;
 }
-#endif
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
-  int i;
-
-  /* Calculate scaling factors for each of the 3 available references */
-  for (i = 0; i < 3; ++i) {
-    if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
-      memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
-      continue;
-    }
-
-    vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
-                                      &cm->yv12_fb[cm->active_ref_idx[i]],
-                                      cm->width, cm->height);
-  }
-
   if (xd->mode_info_context) {
     MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
 
     set_scale_factors(xd,
-                      mbmi->ref_frame - 1,
-                      mbmi->second_ref_frame - 1,
+                      mbmi->ref_frame[0] - 1,
+                      mbmi->ref_frame[1] - 1,
                       cm->active_ref_scale);
   }
 
-
   switch (mcomp_filter_type) {
     case EIGHTTAP:
     case SWITCHABLE:
@@ -244,11 +195,6 @@
     case BILINEAR:
       xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
       break;
-#if CONFIG_ENABLE_6TAP
-    case SIXTAP:
-      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
-      break;
-#endif
   }
   assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
@@ -340,53 +286,6 @@
   }
 }
 
-static void set_scaled_offsets(struct scale_factors *scale,
-                               int row, int col) {
-  const int x_q4 = 16 * col;
-  const int y_q4 = 16 * row;
-
-  scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
-  scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
-}
-
-static int32_t scale_motion_vector_component_q3(int mv_q3,
-                                                int num,
-                                                int den,
-                                                int offset_q4) {
-  // returns the scaled and offset value of the mv component.
-  const int32_t mv_q4 = mv_q3 << 1;
-
-  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
-  return mv_q4 * num / den + offset_q4;
-}
-
-static int32_t scale_motion_vector_component_q4(int mv_q4,
-                                                int num,
-                                                int den,
-                                                int offset_q4) {
-  // returns the scaled and offset value of the mv component.
-
-  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
-  return mv_q4 * num / den + offset_q4;
-}
-
-static int_mv32 scale_motion_vector_q3_to_q4(
-    const int_mv *src_mv,
-    const struct scale_factors *scale) {
-  // returns mv * scale + offset
-  int_mv32 result;
-
-  result.as_mv.row = scale_motion_vector_component_q3(src_mv->as_mv.row,
-                                                      scale->y_num,
-                                                      scale->y_den,
-                                                      scale->y_offset_q4);
-  result.as_mv.col = scale_motion_vector_component_q3(src_mv->as_mv.col,
-                                                      scale->x_num,
-                                                      scale->x_den,
-                                                      scale->x_offset_q4);
-  return result;
-}
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int_mv *mv_q3,
@@ -393,7 +292,7 @@
                                const struct scale_factors *scale,
                                int w, int h, int weight,
                                const struct subpix_fn_table *subpix) {
-  int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale);
+  int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale);
   src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);
   scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight](
       src, src_stride, dst, dst_stride,
@@ -402,26 +301,18 @@
       w, h);
 }
 
-/* Like vp9_build_inter_predictor, but takes the full-pel part of the
- * mv separately, and the fractional part as a q4.
- */
 void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
-                                  const int_mv *fullpel_mv_q3,
-                                  const int_mv *frac_mv_q4,
+                                  const int_mv *mv_q4,
                                   const struct scale_factors *scale,
                                   int w, int h, int weight,
                                   const struct subpix_fn_table *subpix) {
-  const int mv_row_q4 = ((fullpel_mv_q3->as_mv.row >> 3) << 4)
-                        + (frac_mv_q4->as_mv.row & 0xf);
-  const int mv_col_q4 = ((fullpel_mv_q3->as_mv.col >> 3) << 4)
-                        + (frac_mv_q4->as_mv.col & 0xf);
-  const int scaled_mv_row_q4 =
-      scale_motion_vector_component_q4(mv_row_q4, scale->y_num, scale->y_den,
-                                       scale->y_offset_q4);
-  const int scaled_mv_col_q4 =
-      scale_motion_vector_component_q4(mv_col_q4, scale->x_num, scale->x_den,
-                                       scale->x_offset_q4);
+  const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row,
+                                                            scale->y_scale_fp,
+                                                            scale->y_offset_q4);
+  const int scaled_mv_col_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.col,
+                                                            scale->x_scale_fp,
+                                                            scale->x_offset_q4);
   const int subpel_x = scaled_mv_col_q4 & 15;
   const int subpel_y = scaled_mv_row_q4 & 15;
 
@@ -433,1367 +324,205 @@
       w, h);
 }
 
-static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1,
-                                         struct scale_factors *scale,
-                                         uint8_t *predictor,
-                                         int block_size, int stride,
-                                         int which_mv, int weight,
-                                         int width, int height,
-                                         const struct subpix_fn_table *subpix,
-                                         int row, int col) {
-  assert(d1->predictor - d0->predictor == block_size);
-  assert(d1->pre == d0->pre + block_size);
+static INLINE int round_mv_comp_q4(int value) {
+  return (value < 0 ? value - 2 : value + 2) / 4;
+}
 
-  set_scaled_offsets(&scale[which_mv], row, col);
-
-  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
-    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
-
-    vp9_build_inter_predictor(*base_pre + d0->pre,
-                              d0->pre_stride,
-                              predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              width, height,
-                              weight, subpix);
-
-  } else {
-    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
-    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
-
-    vp9_build_inter_predictor(*base_pre0 + d0->pre,
-                              d0->pre_stride,
-                              predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              width > block_size ? block_size : width, height,
-                              weight, subpix);
-
-    if (width <= block_size) return;
-
-    set_scaled_offsets(&scale[which_mv], row, col + block_size);
-
-    vp9_build_inter_predictor(*base_pre1 + d1->pre,
-                              d1->pre_stride,
-                              predictor + block_size, stride,
-                              &d1->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              width - block_size, height,
-                              weight, subpix);
-  }
+static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) {
+  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row +
+                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row;
+  return round_mv_comp_q4(temp);
 }
 
-static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1,
-                                      struct scale_factors *scale,
-                                      int block_size, int stride,
-                                      int which_mv, int weight,
-                                      const struct subpix_fn_table *subpix,
-                                      int row, int col) {
-  assert(d1->predictor - d0->predictor == block_size);
-  assert(d1->pre == d0->pre + block_size);
-
-  set_scaled_offsets(&scale[which_mv], row, col);
-
-  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {
-    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;
-
-    vp9_build_inter_predictor(*base_pre + d0->pre,
-                              d0->pre_stride,
-                              d0->predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              2 * block_size, block_size,
-                              weight, subpix);
-
-  } else {
-    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;
-    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;
-
-    vp9_build_inter_predictor(*base_pre0 + d0->pre,
-                              d0->pre_stride,
-                              d0->predictor, stride,
-                              &d0->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              block_size, block_size,
-                              weight, subpix);
-
-    set_scaled_offsets(&scale[which_mv], row, col + block_size);
-
-    vp9_build_inter_predictor(*base_pre1 + d1->pre,
-                              d1->pre_stride,
-                              d1->predictor, stride,
-                              &d1->bmi.as_mv[which_mv],
-                              &scale[which_mv],
-                              block_size, block_size,
-                              weight, subpix);
-  }
+static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) {
+  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col +
+                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col;
+  return round_mv_comp_q4(temp);
 }
 
-static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
+// TODO(jkoleszar): yet another mv clamping function :-(
+MV clamp_mv_to_umv_border_sb(const MV *src_mv,
+    int bwl, int bhl, int ss_x, int ss_y,
+    int mb_to_left_edge, int mb_to_top_edge,
+    int mb_to_right_edge, int mb_to_bottom_edge) {
   /* If the MV points so far into the UMV border that no visible pixels
    * are used for reconstruction, the subpel part of the MV can be
    * discarded and the MV limited to 16 pixels with equivalent results.
-   *
-   * This limit kicks in at 19 pixels for the top and left edges, for
-   * the 16 pixels plus 3 taps right of the central pixel when subpel
-   * filtering. The bottom and right edges use 16 pixels plus 2 pixels
-   * left of the central pixel when filtering.
    */
-  if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->col = xd->mb_to_left_edge - (16 << 3);
-  else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->col = xd->mb_to_right_edge + (16 << 3);
+  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
+  const int spel_right = spel_left - (1 << 4);
+  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
+  const int spel_bottom = spel_top - (1 << 4);
+  MV clamped_mv;
 
-  if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3)))
-    mv->row = xd->mb_to_top_edge - (16 << 3);
-  else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3))
-    mv->row = xd->mb_to_bottom_edge + (16 << 3);
+  assert(ss_x <= 1);
+  assert(ss_y <= 1);
+  clamped_mv.col = clamp(src_mv->col << (1 - ss_x),
+                         (mb_to_left_edge << (1 - ss_x)) - spel_left,
+                         (mb_to_right_edge << (1 - ss_x)) + spel_right);
+  clamped_mv.row = clamp(src_mv->row << (1 - ss_y),
+                         (mb_to_top_edge << (1 - ss_y)) - spel_top,
+                         (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+  return clamped_mv;
 }
 
-/* A version of the above function for chroma block MVs.*/
-static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {
-  const int extend = VP9_INTERP_EXTEND;
+struct build_inter_predictors_args {
+  MACROBLOCKD *xd;
+  int x;
+  int y;
+  uint8_t* dst[MAX_MB_PLANE];
+  int dst_stride[MAX_MB_PLANE];
+  uint8_t* pre[2][MAX_MB_PLANE];
+  int pre_stride[2][MAX_MB_PLANE];
+};
+static void build_inter_predictors(int plane, int block,
+                                   BLOCK_SIZE_TYPE bsize,
+                                   int pred_w, int pred_h,
+                                   void *argv) {
+  const struct build_inter_predictors_args* const arg = argv;
+  MACROBLOCKD * const xd = arg->xd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bh = 4 << bhl, bw = 4 << bwl;
+  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
+  const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
+  int which_mv;
 
-  mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + extend) << 3))) ?
-            (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
-  mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + extend) << 3)) ?
-            (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+  assert(x < bw);
+  assert(y < bh);
+  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
+         4 << pred_w == bw);
+  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
+         4 << pred_h == bh);
 
-  mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + extend) << 3))) ?
-            (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
-  mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + extend) << 3)) ?
-            (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
-}
+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
+    // source
+    const uint8_t * const base_pre = arg->pre[which_mv][plane];
+    const int pre_stride = arg->pre_stride[which_mv][plane];
+    const uint8_t *const pre = base_pre +
+        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
+    struct scale_factors * const scale =
+      plane == 0 ? &xd->scale_factor[which_mv] : &xd->scale_factor_uv[which_mv];
 
-#define AVERAGE_WEIGHT  (1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT))
+    // dest
+    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
+    // motion vector
+    const MV *mv;
+    MV split_chroma_mv;
+    int_mv clamped_mv;
 
-// Whether to use implicit weighting for UV
-#define USE_IMPLICIT_WEIGHT_UV
-
-// Whether to use implicit weighting for SplitMV
-// #define USE_IMPLICIT_WEIGHT_SPLITMV
-
-// #define SEARCH_MIN3
-static int64_t get_consistency_metric(MACROBLOCKD *xd,
-                                      uint8_t *tmp_y, int tmp_ystride) {
-  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;
-  uint8_t *rec_y = xd->dst.y_buffer;
-  int rec_ystride = xd->dst.y_stride;
-  int64_t metric = 0;
-  int i;
-  if (xd->up_available) {
-    for (i = 0; i < block_size; ++i) {
-      int diff = abs(*(rec_y - rec_ystride + i) -
-                     *(tmp_y + i));
-#ifdef SEARCH_MIN3
-      // Searches for the min abs diff among 3 pixel neighbors in the border
-      int diff1 = xd->left_available ?
-          abs(*(rec_y - rec_ystride + i - 1) - *(tmp_y + i)) : diff;
-      int diff2 = i < block_size - 1 ?
-          abs(*(rec_y - rec_ystride + i + 1) - *(tmp_y + i)) : diff;
-      diff = diff <= diff1 ? diff : diff1;
-      diff = diff <= diff2 ? diff : diff2;
-#endif
-      metric += diff;
-    }
-  }
-  if (xd->left_available) {
-    for (i = 0; i < block_size; ++i) {
-      int diff = abs(*(rec_y - 1 + i * rec_ystride) -
-                     *(tmp_y + i * tmp_ystride));
-#ifdef SEARCH_MIN3
-      // Searches for the min abs diff among 3 pixel neighbors in the border
-      int diff1 = xd->up_available ?
-          abs(*(rec_y - 1 + (i - 1) * rec_ystride) -
-                      *(tmp_y + i * tmp_ystride)) : diff;
-      int diff2 = i < block_size - 1 ?
-          abs(*(rec_y - 1 + (i + 1) * rec_ystride) -
-              *(tmp_y + i * tmp_ystride)) : diff;
-      diff = diff <= diff1 ? diff : diff1;
-      diff = diff <= diff2 ? diff : diff2;
-#endif
-      metric += diff;
-    }
-  }
-  return metric;
-}
-
-static int get_weight(MACROBLOCKD *xd, int64_t metric_1, int64_t metric_2) {
-  int weight = AVERAGE_WEIGHT;
-  if (2 * metric_1 < metric_2)
-    weight = 6;
-  else if (4 * metric_1 < 3 * metric_2)
-    weight = 5;
-  else if (2 * metric_2 < metric_1)
-    weight = 2;
-  else if (4 * metric_2 < 3 * metric_1)
-    weight = 3;
-  return weight;
-}
-
-#ifdef USE_IMPLICIT_WEIGHT_SPLITMV
-static int get_implicit_compoundinter_weight_splitmv(
-    MACROBLOCKD *xd, int mb_row, int mb_col) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-  const int use_second_ref = mbmi->second_ref_frame > 0;
-  int64_t metric_2 = 0, metric_1 = 0;
-  int i, which_mv, weight;
-  uint8_t tmp_y[256];
-  const int tmp_ystride = 16;
-
-  if (!use_second_ref) return 0;
-  if (!(xd->up_available || xd->left_available))
-    return AVERAGE_WEIGHT;
-
-  assert(xd->mode_info_context->mbmi.mode == SPLITMV);
-
-  which_mv = 1;  // second predictor
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-      }
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 16, 1,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
+    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      if (plane == 0) {
+        mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv;
       } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,
-                                     8, 16, which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
+        // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+        // same MV (the average of the 4 luma MVs) but we could do something
+        // smarter for non-4:2:0. Just punt for now, pending the changes to get
+        // rid of SPLITMV mode entirely.
+        split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv);
+        split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv);
+        mv = &split_chroma_mv;
       }
+    } else {
+      mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
     }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
 
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
+    /* TODO(jkoleszar): This clamping is done in the incorrect place for the
+     * scaling case. It needs to be done on the scaled MV, not the pre-scaling
+     * MV. Note however that it performs the subsampling aware scaling so
+     * that the result is always q4.
+     */
+    clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl,
+                                                 xd->plane[plane].subsampling_x,
+                                                 xd->plane[plane].subsampling_y,
+                                                 xd->mb_to_left_edge,
+                                                 xd->mb_to_top_edge,
+                                                 xd->mb_to_right_edge,
+                                                 xd->mb_to_bottom_edge);
+    scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
 
-      if (i >= 4 && (i & 3) != 0) continue;
-
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else if (i < 4) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,
-                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
+    vp9_build_inter_predictor_q4(pre, pre_stride,
+                                 dst, arg->dst_stride[plane],
+                                 &clamped_mv, &xd->scale_factor[which_mv],
+                                 4 << pred_w, 4 << pred_h, which_mv,
+                                 &xd->subpix);
   }
-  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  which_mv = 0;  // first predictor
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      if (mbmi->need_to_clamp_mvs) {
-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-      }
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 16, 1,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,
-                                     which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,
-                                     8, 16, which_mv, 0, 1, 8,
-                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      if (i >= 4 && (i & 3) != 0) continue;
-
-      if (i == 0) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,
-                                     which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else if (i < 4) {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,
-                                     which_mv, 0, 8, 1, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      } else {
-        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,
-                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,
-                                     mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  // Choose final weight for averaging
-  weight = get_weight(xd, metric_1, metric_2);
-  return weight;
 }
-#endif
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
+                                    int mi_row,
+                                    int mi_col,
+                                    BLOCK_SIZE_TYPE bsize) {
+  struct build_inter_predictors_args args = {
+    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
+    {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
+    {{xd->plane[0].pre[0].buf, NULL, NULL},
+     {xd->plane[0].pre[1].buf, NULL, NULL}},
+    {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
+  };
 
-static int get_implicit_compoundinter_weight(MACROBLOCKD *xd,
-                                             int mb_row,
-                                             int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int64_t metric_2 = 0, metric_1 = 0;
-  int n, clamp_mvs, pre_stride;
-  uint8_t *base_pre;
-  int_mv ymv;
-  uint8_t tmp_y[4096];
-  const int tmp_ystride = 64;
-  int weight;
-  int edge[4];
-  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;
-
-  if (!use_second_ref) return 0;
-  if (!(xd->up_available || xd->left_available))
-    return AVERAGE_WEIGHT;
-
-  edge[0] = xd->mb_to_top_edge;
-  edge[1] = xd->mb_to_bottom_edge;
-  edge[2] = xd->mb_to_left_edge;
-  edge[3] = xd->mb_to_right_edge;
-
-  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_secondmv;
-  base_pre = xd->second_pre.y_buffer;
-  pre_stride = xd->second_pre.y_stride;
-  ymv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
-  // First generate the second predictor
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_left_edge   = edge[2] - (n << 3);
-    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[1], mb_row * 16, mb_col * 16 + n);
-    // predict a single row of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[1]),
-        pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[1],
-        16, 1, 0, &xd->subpix);
-  }
-  xd->mb_to_left_edge = edge[2];
-  xd->mb_to_right_edge = edge[3];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_top_edge    = edge[0] - (n << 3);
-    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[1], mb_row * 16 + n, mb_col * 16);
-    // predict a single col of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[1]),
-        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,
-        &xd->scale_factor[1], 1, 16, 0, &xd->subpix);
-  }
-  xd->mb_to_top_edge = edge[0];
-  xd->mb_to_bottom_edge = edge[1];
-  // Compute consistency metric
-  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_mvs;
-  base_pre = xd->pre.y_buffer;
-  pre_stride = xd->pre.y_stride;
-  ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;
-  // Now generate the first predictor
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_left_edge   = edge[2] - (n << 3);
-    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[0], mb_row * 16, mb_col * 16 + n);
-    // predict a single row of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[0]),
-        pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[0],
-        16, 1, 0, &xd->subpix);
-  }
-  xd->mb_to_left_edge = edge[2];
-  xd->mb_to_right_edge = edge[3];
-  for (n = 0; n < block_size; n += 16) {
-    xd->mb_to_top_edge    = edge[0] - (n << 3);
-    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-    set_scaled_offsets(&xd->scale_factor[0], mb_row * 16 + n, mb_col * 16);
-    // predict a single col of pixels
-    vp9_build_inter_predictor(
-        base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[0]),
-        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,
-        &xd->scale_factor[0], 1, 16, 0, &xd->subpix);
-  }
-  xd->mb_to_top_edge = edge[0];
-  xd->mb_to_bottom_edge = edge[1];
-  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);
-
-  // Choose final weight for averaging
-  weight = get_weight(xd, metric_1, metric_2);
-  return weight;
+  foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
 }
-
-static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs = which_mv ?
-        xd->mode_info_context->mbmi.need_to_clamp_secondmv :
-         xd->mode_info_context->mbmi.need_to_clamp_mvs;
-
-    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;
-    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;
-    int_mv ymv;
-    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor(base_pre, pre_stride,
-                              dst_y, dst_ystride,
-                              &ymv, &xd->scale_factor[which_mv],
-                              16, 16, which_mv ? weight : 0, &xd->subpix);
-  }
-}
-
-void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
-
-  build_inter16x16_predictors_mby_w(xd, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs = which_mv ?
-         xd->mode_info_context->mbmi.need_to_clamp_secondmv :
-         xd->mode_info_context->mbmi.need_to_clamp_mvs;
-
-    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;
-    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;
-    int_mv ymv;
-    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&ymv.as_mv, xd);
-
-    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor(base_pre, pre_stride,
-                              dst_y, dst_ystride,
-                              &ymv, &xd->scale_factor[which_mv],
-                              16, 16, which_mv, &xd->subpix);
-  }
-}
-#endif
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_uvstride,
-                                               int weight,
-                                               int mb_row,
-                                               int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *uptr, *vptr;
-    int pre_stride = which_mv ? xd->second_pre.uv_stride
-                              : xd->pre.uv_stride;
-    int_mv _o16x16mv;
-    int_mv _16x16mv;
-
-    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-    _o16x16mv = _16x16mv;
-    /* calc uv motion vectors */
-    if (_16x16mv.as_mv.row < 0)
-      _16x16mv.as_mv.row -= 1;
-    else
-      _16x16mv.as_mv.row += 1;
-
-    if (_16x16mv.as_mv.col < 0)
-      _16x16mv.as_mv.col -= 1;
-    else
-      _16x16mv.as_mv.col += 1;
-
-    _16x16mv.as_mv.row /= 2;
-    _16x16mv.as_mv.col /= 2;
-
-    _16x16mv.as_mv.row &= xd->fullpixel_mask;
-    _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
-    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
-
-    set_scaled_offsets(&xd->scale_factor_uv[which_mv],
-                       mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor_q4(
-        uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv ? weight : 0, &xd->subpix);
-
-    vp9_build_inter_predictor_q4(
-        vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv ? weight : 0, &xd->subpix);
-  }
-}
-
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-#ifdef USE_IMPLICIT_WEIGHT_UV
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
+                                     int mi_row,
+                                     int mi_col,
+                                     BLOCK_SIZE_TYPE bsize) {
+  struct build_inter_predictors_args args = {
+    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
+#if CONFIG_ALPHA
+    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+     xd->plane[3].dst.buf},
+    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
+     xd->plane[3].dst.stride},
+    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
+      xd->plane[3].pre[0].buf},
+     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
+      xd->plane[3].pre[1].buf}},
+    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
+      xd->plane[3].pre[0].stride},
+     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
+      xd->plane[3].pre[1].stride}},
 #else
-  int weight = AVERAGE_WEIGHT;
+    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
+    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
+    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
+     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
+    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
+     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
 #endif
-  build_inter16x16_predictors_mbuv_w(xd, dst_u, dst_v, dst_uvstride,
-                                     weight, mb_row, mb_col);
+  };
+  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
 }
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
+                                   int mi_row, int mi_col,
+                                   BLOCK_SIZE_TYPE bsize) {
 
-#else
-
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-  int which_mv;
-
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    const int clamp_mvs =
-        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv
-                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;
-    uint8_t *uptr, *vptr;
-    int pre_stride = which_mv ? xd->second_pre.uv_stride
-                              : xd->pre.uv_stride;
-    int_mv _o16x16mv;
-    int_mv _16x16mv;
-
-    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;
-
-    if (clamp_mvs)
-      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);
-
-    _o16x16mv = _16x16mv;
-    /* calc uv motion vectors */
-    if (_16x16mv.as_mv.row < 0)
-      _16x16mv.as_mv.row -= 1;
-    else
-      _16x16mv.as_mv.row += 1;
-
-    if (_16x16mv.as_mv.col < 0)
-      _16x16mv.as_mv.col -= 1;
-    else
-      _16x16mv.as_mv.col += 1;
-
-    _16x16mv.as_mv.row /= 2;
-    _16x16mv.as_mv.col /= 2;
-
-    _16x16mv.as_mv.row &= xd->fullpixel_mask;
-    _16x16mv.as_mv.col &= xd->fullpixel_mask;
-
-    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);
-    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);
-
-    set_scaled_offsets(&xd->scale_factor_uv[which_mv],
-                       mb_row * 16, mb_col * 16);
-
-    vp9_build_inter_predictor_q4(
-        uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);
-
-    vp9_build_inter_predictor_q4(
-        vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv,
-        &xd->scale_factor_uv[which_mv], 8, 8,
-        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);
-  }
+  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+  vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
 }
-#endif
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter32x32_predictors_sby_w(MACROBLOCKD *x,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
-                                                y_idx * 16,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 16,
-                               y_idx * 16,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-    build_inter16x16_predictors_mby_w(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_ystride, weight, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);
-  build_inter32x32_predictors_sby_w(x, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col);
-}
-
-#else
-
-// TODO(all): Can we use 32x32 specific implementations of this rather than
-// using 16x16 implementations ?
-void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,
-                                                y_idx * 16,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 16,
-                               y_idx * 16,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-    vp9_build_inter16x16_predictors_mby(x,
-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
-        dst_ystride, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-#endif
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter32x32_predictors_sbuv_w(MACROBLOCKD *x,
-                                               uint8_t *dst_u,
-                                               uint8_t *dst_v,
-                                               int dst_uvstride,
-                                               int weight,
-                                               int mb_row,
-                                               int mb_col) {
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    int scaled_uv_offset;
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                            y_idx * 8,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                              y_idx * 8,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    build_inter16x16_predictors_mbuv_w(x,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_uvstride, weight, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-
-void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-#ifdef USE_IMPLICIT_WEIGHT_UV
-  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-  build_inter32x32_predictors_sbuv_w(xd, dst_u, dst_v, dst_uvstride,
-                                     weight, mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *x,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    int scaled_uv_offset;
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                            y_idx * 8,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,
-                                              y_idx * 8,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    vp9_build_inter16x16_predictors_mbuv(x,
-        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
-        dst_uvstride, mb_row + y_idx, mb_col + x_idx);
-  }
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-#endif
-
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col) {
-  vp9_build_inter32x32_predictors_sby(x, dst_y, dst_ystride,
-                                      mb_row, mb_col);
-  vp9_build_inter32x32_predictors_sbuv(x, dst_u, dst_v, dst_uvstride,
-                                      mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    vp9_build_interintra_32x32_predictors_sb(
-        x, dst_y, dst_u, dst_v, dst_ystride, dst_uvstride);
-  }
-#endif
-}
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-static void build_inter64x64_predictors_sby_w(MACROBLOCKD *x,
-                                              uint8_t *dst_y,
-                                              int dst_ystride,
-                                              int weight,
-                                              int mb_row,
-                                              int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,
-                                                y_idx * 32,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 32,
-                               y_idx * 32,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-
-    build_inter32x32_predictors_sby_w(x,
-        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
-        dst_ystride, weight, mb_row + y_idx * 2, mb_col + x_idx * 2);
-  }
-
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-
-void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);
-  build_inter64x64_predictors_sby_w(x, dst_y, dst_ystride, weight,
-                                    mb_row, mb_col);
-}
-
-#else
-
-void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col) {
-  uint8_t *y1 = x->pre.y_buffer;
-  uint8_t *y2 = x->second_pre.y_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
-
-    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,
-                                                y_idx * 32,
-                                                x->pre.y_stride,
-                                                &x->scale_factor[0]);
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      x->second_pre.y_buffer = y2 +
-          scaled_buffer_offset(x_idx * 32,
-                               y_idx * 32,
-                               x->second_pre.y_stride,
-                               &x->scale_factor[1]);
-    }
-
-    vp9_build_inter32x32_predictors_sby(x,
-        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
-        dst_ystride, mb_row + y_idx * 2, mb_col + x_idx * 2);
-  }
-
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.y_buffer = y1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.y_buffer = y2;
-  }
-}
-#endif
-
-void vp9_build_inter64x64_predictors_sbuv(MACROBLOCKD *x,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col) {
-  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
-  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;
-  int edge[4], n;
-
-  edge[0] = x->mb_to_top_edge;
-  edge[1] = x->mb_to_bottom_edge;
-  edge[2] = x->mb_to_left_edge;
-  edge[3] = x->mb_to_right_edge;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    int scaled_uv_offset;
-
-    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
-    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
-    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
-    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
-
-    scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
-                                            y_idx * 16,
-                                            x->pre.uv_stride,
-                                            &x->scale_factor_uv[0]);
-    x->pre.u_buffer = u1 + scaled_uv_offset;
-    x->pre.v_buffer = v1 + scaled_uv_offset;
-
-    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-      scaled_uv_offset = scaled_buffer_offset(x_idx * 16,
-                                              y_idx * 16,
-                                              x->second_pre.uv_stride,
-                                              &x->scale_factor_uv[1]);
-      x->second_pre.u_buffer = u2 + scaled_uv_offset;
-      x->second_pre.v_buffer = v2 + scaled_uv_offset;
-    }
-
-    vp9_build_inter32x32_predictors_sbuv(x,
-        dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
-        dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
-        dst_uvstride, mb_row + y_idx * 2, mb_col + x_idx * 2);
-  }
-
-  x->mb_to_top_edge    = edge[0];
-  x->mb_to_bottom_edge = edge[1];
-  x->mb_to_left_edge   = edge[2];
-  x->mb_to_right_edge  = edge[3];
-
-  x->pre.u_buffer = u1;
-  x->pre.v_buffer = v1;
-
-  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
-    x->second_pre.u_buffer = u2;
-    x->second_pre.v_buffer = v2;
-  }
-}
-
-void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col) {
-  vp9_build_inter64x64_predictors_sby(x, dst_y, dst_ystride,
-                                      mb_row, mb_col);
-  vp9_build_inter64x64_predictors_sbuv(x, dst_u, dst_v, dst_uvstride,
-                                       mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,
-                                             dst_ystride, dst_uvstride);
-  }
-#endif
-}
-
-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd,
-                                         int mb_row, int mb_col) {
-  int i;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  BLOCKD *blockd = xd->block;
-  int which_mv = 0;
-  const int use_second_ref = mbmi->second_ref_frame > 0;
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && defined(USE_IMPLICIT_WEIGHT_SPLITMV)
-  int weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);
-#else
-  int weight = AVERAGE_WEIGHT;
-#endif
-
-  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {
-    for (i = 0; i < 16; i += 8) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 2];
-      const int y = i & 8;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        if (mbmi->need_to_clamp_mvs) {
-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);
-          clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);
-        }
-
-        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16, which_mv,
-                                  which_mv ? weight : 0,
-                                  &xd->subpix, mb_row * 16 + y, mb_col * 16);
-      }
-    }
-  } else {
-    for (i = 0; i < 16; i += 2) {
-      BLOCKD *d0 = &blockd[i];
-      BLOCKD *d1 = &blockd[i + 1];
-      const int x = (i & 3) * 4;
-      const int y = (i >> 2) * 4;
-
-      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];
-      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16, which_mv,
-                                  which_mv ? weight : 0,
-                                  &xd->subpix,
-                                  mb_row * 16 + y, mb_col * 16 + x);
-      }
-    }
-  }
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-#if !defined(USE_IMPLICIT_WEIGHT_UV)
-  weight = AVERAGE_WEIGHT;
-#endif
-#endif
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-    const int x = 4 * (i & 1);
-    const int y = ((i - 16) >> 1) * 4;
-
-    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
-                                which_mv ? weight : 0, &xd->subpix,
-                                mb_row * 8 + y, mb_col * 8 + x);
-    }
-  }
-}
-
-static INLINE int round_mv_comp(int value) {
-  return (value < 0 ? value - 4 : value + 4) / 8;
-}
-
-static int mi_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
-  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-static int mi_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
-  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-static int b_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {
-  BLOCKD *const blockd = mb->block;
-  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 1].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 4].bmi.as_mv[idx].as_mv.row +
-                   blockd[off + 5].bmi.as_mv[idx].as_mv.row;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-static int b_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {
-  BLOCKD *const blockd = mb->block;
-  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 1].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 4].bmi.as_mv[idx].as_mv.col +
-                   blockd[off + 5].bmi.as_mv[idx].as_mv.col;
-  return round_mv_comp(temp) & mb->fullpixel_mask;
-}
-
-
-static void build_4x4uvmvs(MACROBLOCKD *xd) {
-  int i, j;
-  BLOCKD *blockd = xd->block;
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      const int yoffset = i * 8 + j * 2;
-      const int uoffset = 16 + i * 2 + j;
-      const int voffset = 20 + i * 2 + j;
-
-      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
-      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
-      u->row = mi_mv_pred_row(xd, yoffset, 0);
-      u->col = mi_mv_pred_col(xd, yoffset, 0);
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(u, xd);
-
-      // if (x->mode_info_context->mbmi.need_to_clamp_mvs)
-      clamp_uvmv_to_umv_border(u, xd);
-
-      v->row = u->row;
-      v->col = u->col;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
-        v = &blockd[voffset].bmi.as_mv[1].as_mv;
-        u->row = mi_mv_pred_row(xd, yoffset, 1);
-        u->col = mi_mv_pred_col(xd, yoffset, 1);
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(u, xd);
-
-        // if (mbmi->need_to_clamp_mvs)
-        clamp_uvmv_to_umv_border(u, xd);
-
-        v->row = u->row;
-        v->col = u->col;
-      }
-    }
-  }
-}
-
-void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col) {
-  vp9_build_inter16x16_predictors_mby(xd, dst_y, dst_ystride, mb_row, mb_col);
-  vp9_build_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride,
-                                       mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-    vp9_build_interintra_16x16_predictors_mb(xd, dst_y, dst_u, dst_v,
-                                             dst_ystride, dst_uvstride);
-  }
-#endif
-}
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
-                                   int mb_row,
-                                   int mb_col) {
-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {
-    vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
-                                       &xd->predictor[256],
-                                       &xd->predictor[320], 16, 8,
-                                       mb_row, mb_col);
-
-  } else {
-    build_4x4uvmvs(xd);
-    build_inter4x4_predictors_mb(xd, mb_row, mb_col);
-  }
-}
-
 /*encoder only*/
 void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
                                         int mb_row, int mb_col) {
-  int i, j, weight;
-  BLOCKD *const blockd = xd->block;
+  vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
+                                  BLOCK_SIZE_MB16X16);
+}
 
-  /* build uv mvs */
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      const int yoffset = i * 8 + j * 2;
-      const int uoffset = 16 + i * 2 + j;
-      const int voffset = 20 + i * 2 + j;
-
-      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;
-      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;
-
-      v->row = u->row = b_mv_pred_row(xd, yoffset, 0);
-      v->col = u->col = b_mv_pred_col(xd, yoffset, 0);
-
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        u = &blockd[uoffset].bmi.as_mv[1].as_mv;
-        v = &blockd[voffset].bmi.as_mv[1].as_mv;
-
-        v->row = u->row = b_mv_pred_row(xd, yoffset, 1);
-        v->row = u->col = b_mv_pred_row(xd, yoffset, 1);
-      }
-    }
+// TODO(dkovalev): find better place for this function
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
+  const int ref = cm->active_ref_idx[i];
+  struct scale_factors *const sf = &cm->active_ref_scale[i];
+  if (ref >= NUM_YV12_BUFFERS) {
+    memset(sf, 0, sizeof(*sf));
+  } else {
+    YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
+    vp9_setup_scale_factors_for_frame(sf,
+                                      fb->y_crop_width, fb->y_crop_height,
+                                      cm->width, cm->height);
   }
-
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && \
-  defined(USE_IMPLICIT_WEIGHT_SPLITMV) && \
-  defined(USE_IMPLICIT_WEIGHT_UV)
-  weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);
-#else
-  weight = AVERAGE_WEIGHT;
-#endif
-  for (i = 16; i < 24; i += 2) {
-    const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;
-    const int x = 4 * (i & 1);
-    const int y = ((i - 16) >> 1) * 4;
-
-    int which_mv;
-    BLOCKD *d0 = &blockd[i];
-    BLOCKD *d1 = &blockd[i + 1];
-
-    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,
-                                which_mv ? weight : 0,
-                                &xd->subpix, mb_row * 8 + y, mb_col * 8 + x);
-    }
-  }
 }
+
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -15,61 +15,26 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 struct subpix_fn_table;
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
+                                    int mb_row,
+                                    int mb_col,
+                                    BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                         uint8_t *dst_y,
-                                         int dst_ystride,
-                                         int mb_row,
-                                         int mb_col);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
+                                     int mb_row,
+                                     int mb_col,
+                                     BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                          uint8_t *dst_u,
-                                          uint8_t *dst_v,
-                                          int dst_uvstride,
-                                          int mb_row,
-                                          int mb_col);
+void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
+                                   int mb_row, int mb_col,
+                                   BLOCK_SIZE_TYPE bsize);
 
-void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col);
-
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col);
-
-void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
-                                        uint8_t *dst_y,
-                                        uint8_t *dst_u,
-                                        uint8_t *dst_v,
-                                        int dst_ystride,
-                                        int dst_uvstride,
-                                        int mb_row,
-                                        int mb_col);
-
-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,
-                                   int mb_row,
-                                   int mb_col);
-
-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
-                                        int mb_row,
-                                        int mb_col);
-
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
                               VP9_COMMON *cm);
 
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       YV12_BUFFER_CONFIG *other,
+                                       int other_w, int other_h,
                                        int this_w, int this_h);
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -81,51 +46,73 @@
 
 void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
-                                  const int_mv *fullpel_mv_q3,
-                                  const int_mv *frac_mv_q4,
+                                  const int_mv *mv_q4,
                                   const struct scale_factors *scale,
                                   int w, int h, int do_avg,
                                   const struct subpix_fn_table *subpix);
 
-static int scale_value_x(int val, const struct scale_factors *scale) {
-  return val * scale->x_num / scale->x_den;
+static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+                                const struct scale_factors *scale) {
+  const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
+  const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset;
+  return y * stride + x;
 }
 
-static int scale_value_y(int val, const struct scale_factors *scale) {
-  return val * scale->y_num / scale->y_den;
+static void setup_pred_plane(struct buf_2d *dst,
+                             uint8_t *src, int stride,
+                             int mi_row, int mi_col,
+                             const struct scale_factors *scale,
+                             int subsampling_x, int subsampling_y) {
+  const int x = (MI_SIZE * mi_col) >> subsampling_x;
+  const int y = (MI_SIZE * mi_row) >> subsampling_y;
+  dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
+  dst->stride = stride;
 }
 
-static int scaled_buffer_offset(int x_offset,
-                                int y_offset,
-                                int stride,
-                                const struct scale_factors *scale) {
-  return scale_value_y(y_offset, scale) * stride +
-      scale_value_x(x_offset, scale);
+// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col
+static void setup_dst_planes(MACROBLOCKD *xd,
+                             const YV12_BUFFER_CONFIG *src,
+                             int mi_row, int mi_col) {
+  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                         src->alpha_buffer};
+  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                    src->alpha_stride};
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblockd_plane *pd = &xd->plane[i];
+    setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
+                     pd->subsampling_x, pd->subsampling_y);
+  }
 }
 
-static void setup_pred_block(YV12_BUFFER_CONFIG *dst,
-                             const YV12_BUFFER_CONFIG *src,
-                             int mb_row, int mb_col,
+static void setup_pre_planes(MACROBLOCKD *xd,
+                             const YV12_BUFFER_CONFIG *src0,
+                             const YV12_BUFFER_CONFIG *src1,
+                             int mi_row, int mi_col,
                              const struct scale_factors *scale,
                              const struct scale_factors *scale_uv) {
-  const int recon_y_stride = src->y_stride;
-  const int recon_uv_stride = src->uv_stride;
-  int recon_yoffset;
-  int recon_uvoffset;
+  const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1};
+  int i, j;
 
-  if (scale) {
-    recon_yoffset = scaled_buffer_offset(16 * mb_col, 16 * mb_row,
-                                         recon_y_stride, scale);
-    recon_uvoffset = scaled_buffer_offset(8 * mb_col, 8 * mb_row,
-                                          recon_uv_stride, scale_uv);
-  } else {
-    recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
-    recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
+  for (i = 0; i < 2; ++i) {
+    const YV12_BUFFER_CONFIG *src = srcs[i];
+    if (src) {
+      uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                             src->alpha_buffer};
+      int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                        src->alpha_stride};
+
+      for (j = 0; j < MAX_MB_PLANE; ++j) {
+        struct macroblockd_plane *pd = &xd->plane[j];
+        const struct scale_factors *sf = j ? scale_uv : scale;
+        setup_pred_plane(&pd->pre[i],
+                         buffers[j], strides[j],
+                         mi_row, mi_col, sf ? &sf[i] : NULL,
+                         pd->subsampling_x, pd->subsampling_y);
+      }
+    }
   }
-  *dst = *src;
-  dst->y_buffer += recon_yoffset;
-  dst->u_buffer += recon_uvoffset;
-  dst->v_buffer += recon_uvoffset;
 }
 
 static void set_scale_factors(MACROBLOCKD *xd,
@@ -137,5 +124,7 @@
   xd->scale_factor_uv[0] = xd->scale_factor[0];
   xd->scale_factor_uv[1] = xd->scale_factor[1];
 }
+
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
 
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -13,773 +13,345 @@
 #include "./vpx_config.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"
 
-// For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
-// and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
-
-// Using multiplication and shifting instead of division in diagonal prediction.
-// iscale table is calculated from ((1 << 16) + (i + 2) / 2) / (i+2) and used as
-// ((A + B) * iscale[i] + (1 << 15)) >> 16;
-// where A and B are weighted pixel values.
-static const unsigned int iscale[64] = {
-  32768, 21845, 16384, 13107, 10923,  9362,  8192,  7282,
-   6554,  5958,  5461,  5041,  4681,  4369,  4096,  3855,
-   3641,  3449,  3277,  3121,  2979,  2849,  2731,  2621,
-   2521,  2427,  2341,  2260,  2185,  2114,  2048,  1986,
-   1928,  1872,  1820,  1771,  1725,  1680,  1638,  1598,
-   1560,  1524,  1489,  1456,  1425,  1394,  1365,  1337,
-   1311,  1285,  1260,  1237,  1214,  1192,  1170,  1150,
-   1130,  1111,  1092,  1074,  1057,  1040,  1024,  1008,
-};
-
-static INLINE int iscale_round(int value, int i) {
-    return ROUND_POWER_OF_TWO(value * iscale[i], 16);
-}
-
-static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d27_predictor(uint8_t *ypred_ptr, int y_stride,
+                          int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-
-  r = 0;
-  for (c = 0; c < n - 2; c++) {
-    int a = c & 1 ? yleft_col[r + 1]
-                  : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
-    int b = yabove_row[c + 2];
-    ypred_ptr[c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
+  // first column
+  for (r = 0; r < bh - 1; ++r) {
+      ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] +
+                                                   yleft_col[r + 1], 1);
   }
-
-  for (r = 1; r < n / 2 - 1; r++) {
-    for (c = 0; c < n - 2 - 2 * r; c++) {
-      int a = c & 1 ? yleft_col[r + 1]
-                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
-      int b = ypred_ptr[(r - 1) * y_stride + c + 2];
-      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
-    }
+  ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1];
+  ypred_ptr++;
+  // second column
+  for (r = 0; r < bh - 2; ++r) {
+      ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] +
+                                                   yleft_col[r + 1] * 2 +
+                                                   yleft_col[r + 2], 2);
   }
+  ypred_ptr[(bh - 2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[bh - 2] +
+                                                      yleft_col[bh - 1] * 3,
+                                                      2);
+  ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1];
+  ypred_ptr++;
 
-  for (; r < n - 1; r++) {
-    for (c = 0; c < n; c++) {
-      int v = c & 1 ? yleft_col[r + 1]
-                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);
-      int h = r - c / 2;
-      ypred_ptr[h * y_stride + c] = v;
-    }
+  // rest of last row
+  for (c = 0; c < bw - 2; ++c) {
+    ypred_ptr[(bh - 1) * y_stride + c] = yleft_col[bh-1];
   }
 
-  c = 0;
-  r = n - 1;
-  ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride] +
-                                               yleft_col[r], 1);
-  for (r = n - 2; r >= n / 2; --r) {
-    int w = c + (n - 1 - r) * 2;
-    ypred_ptr[r * y_stride + w] =
-        ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +
-                           ypred_ptr[r * y_stride + w - 1], 1);
-  }
-
-  for (c = 1; c < n; c++) {
-    for (r = n - 1; r >= n / 2 + c / 2; --r) {
-      int w = c + (n - 1 - r) * 2;
-      ypred_ptr[r * y_stride + w] =
-          ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +
-                             ypred_ptr[r * y_stride + w - 1], 1);
+  for (r = bh - 2; r >= 0; --r) {
+    for (c = 0; c < bw - 2; ++c) {
+      ypred_ptr[r * y_stride + c] = ypred_ptr[(r + 1) * y_stride + c - 2];
     }
   }
 }
 
-static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d63_predictor(uint8_t *ypred_ptr, int y_stride,
+                          int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-
-  c = 0;
-  for (r = 0; r < n - 2; r++) {
-    int a = r & 1 ? yabove_row[c + 1]
-                  : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
-    int b = yleft_col[r + 2];
-    ypred_ptr[r * y_stride] = iscale_round(2 * a + (r + 1) * b, 1 + r);
-  }
-
-  for (c = 1; c < n / 2 - 1; c++) {
-    for (r = 0; r < n - 2 - 2 * c; r++) {
-      int a = r & 1 ? yabove_row[c + 1]
-                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
-      int b = ypred_ptr[(r + 2) * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);
+  for (r = 0; r < bh; ++r) {
+    for (c = 0; c < bw; ++c) {
+      if (r & 1) {
+        ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r/2 + c] +
+                                          yabove_row[r/2 + c + 1] * 2 +
+                                          yabove_row[r/2 + c + 2], 2);
+      } else {
+        ypred_ptr[c] =ROUND_POWER_OF_TWO(yabove_row[r/2 + c] +
+                                         yabove_row[r/2+ c + 1], 1);
+      }
     }
+    ypred_ptr += y_stride;
   }
-
-  for (; c < n - 1; ++c) {
-    for (r = 0; r < n; r++) {
-      int v = r & 1 ? yabove_row[c + 1]
-                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);
-      int w = c - r / 2;
-      ypred_ptr[r * y_stride + w] = v;
-    }
-  }
-
-  r = 0;
-  c = n - 1;
-  ypred_ptr[c] = ROUND_POWER_OF_TWO(ypred_ptr[(c - 1)] + yabove_row[c], 1);
-  for (c = n - 2; c >= n / 2; --c) {
-    int h = r + (n - 1 - c) * 2;
-    ypred_ptr[h * y_stride + c] =
-         ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +
-                            ypred_ptr[(h - 1) * y_stride + c], 1);
-  }
-
-  for (r = 1; r < n; r++) {
-    for (c = n - 1; c >= n / 2 + r / 2; --c) {
-      int h = r + (n - 1 - c) * 2;
-      ypred_ptr[h * y_stride + c] =
-          ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +
-                             ypred_ptr[(h - 1) * y_stride + c], 1);
-    }
-  }
 }
 
-static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d45_predictor(uint8_t *ypred_ptr, int y_stride,
+                          int bw, int bh,
                           uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-
-  for (r = 0; r < n - 1; ++r) {
-    for (c = 0; c <= r; ++c) {
-      ypred_ptr[(r - c) * y_stride + c] = iscale_round(
-          yabove_row[r + 1] * (c + 1) + yleft_col[r + 1] * (r - c + 1), r);
+  for (r = 0; r < bh; ++r) {
+    for (c = 0; c < bw; ++c) {
+      if (r + c + 2 < bw * 2)
+        ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r + c] +
+                                          yabove_row[r + c + 1] * 2 +
+                                          yabove_row[r + c + 2], 2);
+      else
+        ypred_ptr[c] = yabove_row[bw * 2 - 1];
     }
+    ypred_ptr += y_stride;
   }
-
-  for (c = 0; c <= r; ++c) {
-    int yabove_ext = yabove_row[r];  // clip_pixel(2 * yabove_row[r] -
-                                     //            yabove_row[r - 1]);
-    int yleft_ext = yleft_col[r];  // clip_pixel(2 * yleft_col[r] -
-                                   //            yleft_col[r-1]);
-    ypred_ptr[(r - c) * y_stride + c] =
-         iscale_round(yabove_ext * (c + 1) + yleft_ext * (r - c + 1), r);
-  }
-  for (r = 1; r < n; ++r) {
-    for (c = n - r; c < n; ++c) {
-      const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c];
-      const int yleft_ext = ypred_ptr[r * y_stride + c - 1];
-      ypred_ptr[r * y_stride + c] =
-          ROUND_POWER_OF_TWO(yabove_ext + yleft_ext, 1);
-    }
-  }
 }
 
-static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+static void d117_predictor(uint8_t *ypred_ptr, int y_stride,
+                           int bw, int bh,
                            uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-  for (c = 0; c < n; c++)
+  // first row
+  for (c = 0; c < bw; c++)
     ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1);
   ypred_ptr += y_stride;
-  for (c = 0; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
+
+  // second row
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
+                                    yabove_row[-1] * 2 +
+                                    yabove_row[0], 2);
+  for (c = 1; c < bw; c++)
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] +
+                                      yabove_row[c - 1] * 2 +
+                                      yabove_row[c], 2);
   ypred_ptr += y_stride;
-  for (r = 2; r < n; ++r) {
-    ypred_ptr[0] = yleft_col[r - 2];
-    for (c = 1; c < n; c++)
+
+  // the rest of first col
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] +
+                                    yleft_col[0] * 2 +
+                                    yleft_col[1], 2);
+  for (r = 3; r < bh; ++r)
+    ypred_ptr[(r-2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 3] +
+                                                     yleft_col[r - 2] * 2 +
+                                                     yleft_col[r - 1], 2);
+  // the rest of the block
+  for (r = 2; r < bh; ++r) {
+    for (c = 1; c < bw; c++)
       ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1];
     ypred_ptr += y_stride;
   }
 }
 
-static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+
+static void d135_predictor(uint8_t *ypred_ptr, int y_stride,
+                           int bw, int bh,
                            uint8_t *yabove_row, uint8_t *yleft_col) {
   int r, c;
-  ypred_ptr[0] = yabove_row[-1];
-  for (c = 1; c < n; c++)
-    ypred_ptr[c] = yabove_row[c - 1];
-  for (r = 1; r < n; ++r)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
+                                    yabove_row[-1] * 2 +
+                                    yabove_row[0], 2);
+  for (c = 1; c < bw; c++)
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] +
+                                      yabove_row[c - 1] * 2 +
+                                      yabove_row[c], 2);
 
+  ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] +
+                                           yleft_col[0] * 2 +
+                                           yleft_col[1], 2);
+  for (r = 2; r < bh; ++r)
+    ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] +
+                                                 yleft_col[r - 1] * 2 +
+                                                 yleft_col[r], 2);
+
   ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 1; c < n; c++) {
+  for (r = 1; r < bh; ++r) {
+    for (c = 1; c < bw; c++)
       ypred_ptr[c] = ypred_ptr[-y_stride + c - 1];
-    }
     ypred_ptr += y_stride;
   }
 }
 
-static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                           uint8_t *yabove_row, uint8_t *yleft_col) {
+static void d153_predictor(uint8_t *ypred_ptr,
+                           int y_stride,
+                           int bw, int bh,
+                           uint8_t *yabove_row,
+                           uint8_t *yleft_col) {
   int r, c;
   ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1);
-  for (r = 1; r < n; r++)
+  for (r = 1; r < bh; r++)
     ypred_ptr[r * y_stride] =
         ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1);
   ypred_ptr++;
-  ypred_ptr[0] = yabove_row[-1];
-  for (r = 1; r < n; r++)
-    ypred_ptr[r * y_stride] = yleft_col[r - 1];
+
+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] +
+                                    yabove_row[-1] * 2 +
+                                    yabove_row[0], 2);
+  ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] +
+                                           yleft_col[0] * 2 +
+                                           yleft_col[1], 2);
+  for (r = 2; r < bh; r++)
+    ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] +
+                                                 yleft_col[r - 1] * 2 +
+                                                 yleft_col[r], 2);
   ypred_ptr++;
 
-  for (c = 0; c < n - 2; c++)
-    ypred_ptr[c] = yabove_row[c];
+  for (c = 0; c < bw - 2; c++)
+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] +
+                                      yabove_row[c] * 2 +
+                                      yabove_row[c + 1], 2);
   ypred_ptr += y_stride;
-  for (r = 1; r < n; ++r) {
-    for (c = 0; c < n - 2; c++)
+  for (r = 1; r < bh; ++r) {
+    for (c = 0; c < bw - 2; c++)
       ypred_ptr[c] = ypred_ptr[-y_stride + c - 2];
     ypred_ptr += y_stride;
   }
 }
 
-static void corner_predictor(uint8_t *ypred_ptr, int y_stride, int n,
-                             uint8_t *yabove_row,
-                             uint8_t *yleft_col) {
-  int mh, mv, maxgradh, maxgradv, x, y, nx, ny;
-  int i, j;
-  int top_left = yabove_row[-1];
-  mh = mv = 0;
-  maxgradh = yabove_row[1] - top_left;
-  maxgradv = yleft_col[1] - top_left;
-  for (i = 2; i < n; ++i) {
-    int gh = yabove_row[i] - yabove_row[i - 2];
-    int gv = yleft_col[i] - yleft_col[i - 2];
-    if (gh > maxgradh) {
-      maxgradh = gh;
-      mh = i - 1;
-    }
-    if (gv > maxgradv) {
-      maxgradv = gv;
-      mv = i - 1;
-    }
-  }
-  nx = mh + mv + 3;
-  ny = 2 * n + 1 - nx;
-
-  x = top_left;
-  for (i = 0; i <= mh; ++i) x += yabove_row[i];
-  for (i = 0; i <= mv; ++i) x += yleft_col[i];
-  x += (nx >> 1);
-  x /= nx;
-  y = 0;
-  for (i = mh + 1; i < n; ++i) y += yabove_row[i];
-  for (i = mv + 1; i < n; ++i) y += yleft_col[i];
-  y += (ny >> 1);
-  y /= ny;
-
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n; ++j)
-      ypred_ptr[j] = (i <= mh && j <= mv ? x : y);
-    ypred_ptr += y_stride;
-  }
-}
-
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd) {
-  int i;
-  for (i = 16; i < 24; i += 2) {
-    BLOCKD *b = &xd->block[i];
-    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-  }
-}
-
-static INLINE int log2_minus_1(int n) {
-  switch (n) {
-    case 4: return 1;
-    case 8: return 2;
-    case 16: return 3;
-    case 32: return 4;
-    case 64: return 5;
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
-
-void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,
-                                         uint8_t *ypred_ptr,
-                                         int y_stride, int mode, int bsize,
-                                         int up_available, int left_available,
-                                         int right_available) {
+void vp9_build_intra_predictors(uint8_t *src, int src_stride,
+                                uint8_t *ypred_ptr,
+                                int y_stride, int mode,
+                                int bw, int bh,
+                                int up_available, int left_available,
+                                int right_available) {
   int r, c, i;
-  uint8_t yleft_col[64], yabove_data[65], ytop_left;
+  uint8_t yleft_col[64], yabove_data[129], ytop_left;
   uint8_t *yabove_row = yabove_data + 1;
-  /*
-   * 127 127 127 .. 127 127 127 127 127 127
-   * 129  A   B  ..  Y   Z
-   * 129  C   D  ..  W   X
-   * 129  E   F  ..  U   V
-   * 129  G   H  ..  S   T   T   T   T   T
-   *  ..
-   */
 
+  // 127 127 127 .. 127 127 127 127 127 127
+  // 129  A   B  ..  Y   Z
+  // 129  C   D  ..  W   X
+  // 129  E   F  ..  U   V
+  // 129  G   H  ..  S   T   T   T   T   T
+  // ..
+
+  assert(bw == bh);
+
   if (left_available) {
-    for (i = 0; i < bsize; i++)
+    for (i = 0; i < bh; i++)
       yleft_col[i] = src[i * src_stride - 1];
   } else {
-    vpx_memset(yleft_col, 129, bsize);
+    vpx_memset(yleft_col, 129, bh);
   }
 
   if (up_available) {
     uint8_t *yabove_ptr = src - src_stride;
-    vpx_memcpy(yabove_row, yabove_ptr, bsize);
-    if (left_available) {
-      ytop_left = yabove_ptr[-1];
-    } else {
-      ytop_left = 127;
-    }
+    vpx_memcpy(yabove_row, yabove_ptr, bw);
+    if (bw == 4 && right_available)
+      vpx_memcpy(yabove_row + bw, yabove_ptr + bw, bw);
+    else
+      vpx_memset(yabove_row + bw, yabove_row[bw -1], bw);
+    ytop_left = left_available ? yabove_ptr[-1] : 129;
   } else {
-    vpx_memset(yabove_row, 127, bsize);
+    vpx_memset(yabove_row, 127, bw * 2);
     ytop_left = 127;
   }
   yabove_row[-1] = ytop_left;
 
-  /* for Y */
   switch (mode) {
     case DC_PRED: {
-      int expected_dc;
       int i;
-      int shift;
+      int expected_dc = 128;
       int average = 0;
-      int log2_bsize_minus_1 = log2_minus_1(bsize);
+      int count = 0;
 
       if (up_available || left_available) {
         if (up_available) {
-          for (i = 0; i < bsize; i++) {
+          for (i = 0; i < bw; i++)
             average += yabove_row[i];
-          }
+          count += bw;
         }
-
         if (left_available) {
-          for (i = 0; i < bsize; i++) {
+          for (i = 0; i < bh; i++)
             average += yleft_col[i];
-          }
+          count += bh;
         }
-        shift = log2_bsize_minus_1 + up_available + left_available;
-        expected_dc = ROUND_POWER_OF_TWO(average, shift);
-      } else {
-        expected_dc = 128;
+        expected_dc = (average + (count >> 1)) / count;
       }
-
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, expected_dc, bsize);
+      for (r = 0; r < bh; r++) {
+        vpx_memset(ypred_ptr, expected_dc, bw);
         ypred_ptr += y_stride;
       }
     }
     break;
     case V_PRED:
-      for (r = 0; r < bsize; r++) {
-        memcpy(ypred_ptr, yabove_row, bsize);
+      for (r = 0; r < bh; r++) {
+        vpx_memcpy(ypred_ptr, yabove_row, bw);
         ypred_ptr += y_stride;
       }
       break;
     case H_PRED:
-      for (r = 0; r < bsize; r++) {
-        vpx_memset(ypred_ptr, yleft_col[r], bsize);
+      for (r = 0; r < bh; r++) {
+        vpx_memset(ypred_ptr, yleft_col[r], bw);
         ypred_ptr += y_stride;
       }
       break;
     case TM_PRED:
-      for (r = 0; r < bsize; r++) {
-        for (c = 0; c < bsize; c++) {
+      for (r = 0; r < bh; r++) {
+        for (c = 0; c < bw; c++)
           ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);
-        }
-
         ypred_ptr += y_stride;
       }
       break;
     case D45_PRED:
-      d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d45_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D135_PRED:
-      d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d135_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D117_PRED:
-      d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d117_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D153_PRED:
-      d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d153_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D27_PRED:
-      d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d27_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
     case D63_PRED:
-      d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
+      d63_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col);
       break;
-    case I8X8_PRED:
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-      break;
-  }
-}
-
-#if CONFIG_COMP_INTERINTRA_PRED
-static void combine_interintra(MB_PREDICTION_MODE mode,
-                               uint8_t *interpred,
-                               int interstride,
-                               uint8_t *intrapred,
-                               int intrastride,
-                               int size) {
-  // TODO(debargha): Explore different ways of combining predictors
-  //                 or designing the tables below
-  static const int scale_bits = 8;
-  static const int scale_max = 256;     // 1 << scale_bits;
-  static const int scale_round = 127;   // (1 << (scale_bits - 1));
-  // This table is a function A + B*exp(-kx), where x is hor. index
-  static const int weights1d[64] = {
-    128, 125, 122, 119, 116, 114, 111, 109,
-    107, 105, 103, 101,  99,  97,  96,  94,
-     93,  91,  90,  89,  88,  86,  85,  84,
-     83,  82,  81,  81,  80,  79,  78,  78,
-     77,  76,  76,  75,  75,  74,  74,  73,
-     73,  72,  72,  71,  71,  71,  70,  70,
-     70,  70,  69,  69,  69,  69,  68,  68,
-     68,  68,  68,  67,  67,  67,  67,  67,
-  };
-
-  int size_scale = (size >= 64 ? 1:
-                    size == 32 ? 2 :
-                    size == 16 ? 4 :
-                    size == 8  ? 8 : 16);
-  int i, j;
-  switch (mode) {
-    case V_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[i * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case H_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[j * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D63_PRED:
-    case D117_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[i * size_scale] * 3 +
-                       weights1d[j * size_scale]) >> 2;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D27_PRED:
-    case D153_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[j * size_scale] * 3 +
-                       weights1d[i * size_scale]) >> 2;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D135_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = weights1d[(i < j ? i : j) * size_scale];
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case D45_PRED:
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          int scale = (weights1d[i * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
-          interpred[k] =
-              ((scale_max - scale) * interpred[k] +
-               scale * intrapred[i * intrastride + j] + scale_round)
-              >> scale_bits;
-        }
-      }
-      break;
-
-    case TM_PRED:
-    case DC_PRED:
     default:
-      // simple average
-      for (i = 0; i < size; ++i) {
-        for (j = 0; j < size; ++j) {
-          int k = i * interstride + j;
-          interpred[k] = (interpred[k] + intrapred[i * intrastride + j]) >> 1;
-        }
-      }
       break;
   }
 }
 
-void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride, int uvstride) {
-  vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride);
-  vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride);
+void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd,
+                                      BLOCK_SIZE_TYPE bsize) {
+  const struct macroblockd_plane* const pd = &xd->plane[0];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
+  vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride,
+                             pd->dst.buf, pd->dst.stride,
+                             xd->mode_info_context->mbmi.mode,
+                             bw, bh, xd->up_available, xd->left_available,
+                             0 /*xd->right_available*/);
 }
 
-void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[256];
-  vp9_build_intra_predictors_internal(
-      xd->dst.y_buffer, xd->dst.y_stride,
-      intrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_mode, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 16, 16);
-}
+void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd,
+                                       BLOCK_SIZE_TYPE bsize) {
+  const int bwl = b_width_log2(bsize), bw = 2 << bwl;
+  const int bhl = b_height_log2(bsize), bh = 2 << bhl;
 
-void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[64];
-  uint8_t vintrapredictor[64];
-  vp9_build_intra_predictors_internal(
-      xd->dst.u_buffer, xd->dst.uv_stride,
-      uintrapredictor, 8,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 8,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors_internal(
-      xd->dst.v_buffer, xd->dst.uv_stride,
-      vintrapredictor, 8,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 8,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 8, 8);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 8, 8);
+  vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride,
+                             xd->plane[1].dst.buf, xd->plane[1].dst.stride,
+                             xd->mode_info_context->mbmi.uv_mode,
+                             bw, bh, xd->up_available,
+                             xd->left_available, 0 /*xd->right_available*/);
+  vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                             xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                             xd->mode_info_context->mbmi.uv_mode,
+                             bw, bh, xd->up_available,
+                             xd->left_available, 0 /*xd->right_available*/);
 }
 
-void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[1024];
-  vp9_build_intra_predictors_internal(
-      xd->dst.y_buffer, xd->dst.y_stride,
-      intrapredictor, 32,
-      xd->mode_info_context->mbmi.interintra_mode, 32,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 32, 32);
-}
+void vp9_predict_intra_block(MACROBLOCKD *xd,
+                            int block_idx,
+                            int bwl_in,
+                            TX_SIZE tx_size,
+                            int mode,
+                            uint8_t *predictor, int pre_stride) {
+  const int bwl = bwl_in - tx_size;
+  const int wmask = (1 << bwl) - 1;
+  const int have_top = (block_idx >> bwl) || xd->up_available;
+  const int have_left = (block_idx & wmask) || xd->left_available;
+  const int have_right = ((block_idx & wmask) != wmask);
+  const int txfm_block_size = 4 << tx_size;
 
-void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[256];
-  uint8_t vintrapredictor[256];
-  vp9_build_intra_predictors_internal(
-      xd->dst.u_buffer, xd->dst.uv_stride,
-      uintrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  vp9_build_intra_predictors_internal(
-      xd->dst.v_buffer, xd->dst.uv_stride,
-      vintrapredictor, 16,
-      xd->mode_info_context->mbmi.interintra_uv_mode, 16,
-      xd->up_available, xd->left_available, xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 16, 16);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 16, 16);
+  assert(bwl >= 0);
+  vp9_build_intra_predictors(predictor, pre_stride,
+                             predictor, pre_stride,
+                             mode,
+                             txfm_block_size,
+                             txfm_block_size,
+                             have_top, have_left,
+                             have_right);
 }
 
-void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride) {
-  vp9_build_interintra_32x32_predictors_sby(xd, ypred, ystride);
-  vp9_build_interintra_32x32_predictors_sbuv(xd, upred, vpred, uvstride);
-}
-
-void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride) {
-  uint8_t intrapredictor[4096];
-  const int mode = xd->mode_info_context->mbmi.interintra_mode;
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      intrapredictor, 64, mode, 64,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
-                     ypred, ystride, intrapredictor, 64, 64);
-}
-
-void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride) {
-  uint8_t uintrapredictor[1024];
-  uint8_t vintrapredictor[1024];
-  const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
-  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
-                                      uintrapredictor, 32, mode, 32,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
-                                      vintrapredictor, 32, mode, 32,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     upred, uvstride, uintrapredictor, 32, 32);
-  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
-                     vpred, uvstride, vintrapredictor, 32, 32);
-}
-
-void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride) {
-  vp9_build_interintra_64x64_predictors_sby(xd, ypred, ystride);
-  vp9_build_interintra_64x64_predictors_sbuv(xd, upred, vpred, uvstride);
-}
-#endif  // CONFIG_COMP_INTERINTRA_PRED
-
-void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->predictor, 16,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 16,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 32,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->dst.y_buffer, xd->dst.y_stride,
-                                      xd->mode_info_context->mbmi.mode, 64,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
-                                              uint8_t *upred_ptr,
-                                              uint8_t *vpred_ptr,
-                                              int uv_stride,
-                                              int mode, int bsize) {
-  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
-                                      upred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
-                                      vpred_ptr, uv_stride, mode, bsize,
-                                      xd->up_available, xd->left_available,
-                                      xd->right_available);
-}
-
-void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
-                                           &xd->predictor[320], 8,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           8);
-}
-
-void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer, xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           16);
-}
-
-void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) {
-  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
-                                           xd->dst.v_buffer, xd->dst.uv_stride,
-                                           xd->mode_info_context->mbmi.uv_mode,
-                                           32);
-}
-
-void vp9_intra8x8_predict(MACROBLOCKD *xd,
-                          BLOCKD *b,
+void vp9_intra4x4_predict(MACROBLOCKD *xd,
+                          int block_idx,
+                          BLOCK_SIZE_TYPE bsize,
                           int mode,
-                          uint8_t *predictor) {
-  const int block4x4_idx = (b - xd->block);
-  const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2);
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1)  || xd->left_available;
-  const int have_right = !(block_idx & 1) || xd->right_available;
-
-  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,
-                                      b->dst_stride, predictor, 16,
-                                      mode, 8, have_top, have_left,
-                                      have_right);
+                          uint8_t *predictor, int pre_stride) {
+  vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4,
+                          mode, predictor, pre_stride);
 }
-
-void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,
-                             BLOCKD *b,
-                             int mode,
-                             uint8_t *predictor) {
-  const int block_idx = (b - xd->block) & 3;
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1)  || xd->left_available;
-  const int have_right = !(block_idx & 1) || xd->right_available;
-
-  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,
-                                      b->dst_stride, predictor, 8,
-                                      mode, 4, have_top, have_left,
-                                      have_right);
-}
-
-/* TODO: try different ways of use Y-UV mode correlation
-   Current code assumes that a uv 4x4 block use same mode
-   as corresponding Y 8x8 area
-   */
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,44 +14,17 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
+MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                               int stride, int n,
+                                               int tx, int ty);
 
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty);
+MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block,
+                                          uint8_t *ptr, int stride);
 
-B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);
-
-#if CONFIG_COMP_INTERINTRA_PRED
-void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
-void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                               uint8_t *ypred,
-                                               int ystride);
-
-void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                uint8_t *upred,
-                                                uint8_t *vpred,
-                                                int uvstride);
-#endif  // CONFIG_COMP_INTERINTRA_PRED
-
-void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
-void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
-                                              uint8_t *ypred,
-                                              uint8_t *upred,
-                                              uint8_t *vpred,
-                                              int ystride,
-                                              int uvstride);
-
+void vp9_predict_intra_block(MACROBLOCKD *xd,
+                            int block_idx,
+                            int bwl_in,
+                            TX_SIZE tx_size,
+                            int mode,
+                            uint8_t *predictor, int pre_stride);
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
--- a/vp9/common/vp9_reconintra4x4.c
+++ /dev/null
@@ -1,503 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9_rtcd.h"
-
-#if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,
-                             int dx, int dy) {
-  int i, j;
-  int count = 0, gsum = 0, gdiv;
-  /* TODO: Make this code more efficient by breaking up into two loops */
-  for (i = -ty; i < n; ++i)
-    for (j = -tx; j < n; ++j) {
-      int g;
-      if (i >= 0 && j >= 0) continue;
-      if (i + dy >= 0 && j + dx >= 0) continue;
-      if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;
-      g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);
-      gsum += g * g;
-      count++;
-    }
-  gdiv = (dx * dx + dy * dy) * count;
-  return ((gsum << 8) + (gdiv >> 1)) / gdiv;
-}
-
-#if CONTEXT_PRED_REPLACEMENTS == 6
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imin = 1;
-  for (i = 2; i < 8; i += 1 + (i == 3))
-    imin = (g[i] < g[imin] ? i : imin);
-  imax = 1;
-  for (i = 2; i < 8; i += 1 + (i == 3))
-    imax = (g[i] > g[imax] ? i : imax);
-  /*
-  printf("%d %d %d %d %d %d = %d %d\n",
-         g[1], g[2], g[3], g[5], g[6], g[7], imin, imax);
-         */
-  switch (imin) {
-    case 1:
-      return B_HD_PRED;
-    case 2:
-      return B_RD_PRED;
-    case 3:
-      return B_VR_PRED;
-    case 5:
-      return B_VL_PRED;
-    case 6:
-      return B_LD_PRED;
-    case 7:
-      return B_HU_PRED;
-    default:
-      assert(0);
-  }
-}
-#elif CONTEXT_PRED_REPLACEMENTS == 4
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imin = 1;
-  for (i = 3; i < 8; i+=2)
-    imin = (g[i] < g[imin] ? i : imin);
-  imax = 1;
-  for (i = 3; i < 8; i+=2)
-    imax = (g[i] > g[imax] ? i : imax);
-  /*
-  printf("%d %d %d %d = %d %d\n",
-         g[1], g[3], g[5], g[7], imin, imax);
-         */
-  switch (imin) {
-    case 1:
-      return B_HD_PRED;
-    case 3:
-      return B_VR_PRED;
-    case 5:
-      return B_VL_PRED;
-    case 7:
-      return B_HU_PRED;
-    default:
-      assert(0);
-  }
-}
-#elif CONTEXT_PRED_REPLACEMENTS == 0
-B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
-                                              int stride, int n,
-                                              int tx, int ty) {
-  int g[8], i, imin, imax;
-  g[0] = find_grad_measure(ptr, stride, n, tx, ty,  1, 0);
-  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);
-  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);
-  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);
-  g[4] = find_grad_measure(ptr, stride, n, tx, ty,  0, 1);
-  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);
-  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);
-  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);
-  imax = 0;
-  for (i = 1; i < 8; i++)
-    imax = (g[i] > g[imax] ? i : imax);
-  imin = 0;
-  for (i = 1; i < 8; i++)
-    imin = (g[i] < g[imin] ? i : imin);
-
-  switch (imin) {
-    case 0:
-      return B_HE_PRED;
-    case 1:
-      return B_HD_PRED;
-    case 2:
-      return B_RD_PRED;
-    case 3:
-      return B_VR_PRED;
-    case 4:
-      return B_VE_PRED;
-    case 5:
-      return B_VL_PRED;
-    case 6:
-      return B_LD_PRED;
-    case 7:
-      return B_HU_PRED;
-    default:
-      assert(0);
-  }
-}
-#endif
-
-B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) {
-  const int block_idx = x - xd->block;
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  uint8_t *ptr = *(x->base_dst) + x->dst;
-  int stride = x->dst_stride;
-  int tx = have_left ? 4 : 0;
-  int ty = have_top ? 4 : 0;
-  if (!have_left && !have_top)
-    return B_DC_PRED;
-  return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);
-}
-#endif
-
-void vp9_intra4x4_predict(MACROBLOCKD *xd,
-                          BLOCKD *x,
-                          int b_mode,
-                          uint8_t *predictor) {
-  int i, r, c;
-  const int block_idx = x - xd->block;
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  const int have_right = (block_idx & 3) != 3 || xd->right_available;
-  uint8_t left[4], above[8], top_left;
-  /*
-   * 127 127 127 .. 127 127 127 127 127 127
-   * 129  A   B  ..  Y   Z
-   * 129  C   D  ..  W   X
-   * 129  E   F  ..  U   V
-   * 129  G   H  ..  S   T   T   T   T   T
-   *  ..
-   */
-
-  if (have_left) {
-    uint8_t *left_ptr = *(x->base_dst) + x->dst - 1;
-    const int stride = x->dst_stride;
-
-    left[0] = left_ptr[0 * stride];
-    left[1] = left_ptr[1 * stride];
-    left[2] = left_ptr[2 * stride];
-    left[3] = left_ptr[3 * stride];
-  } else {
-    left[0] = left[1] = left[2] = left[3] = 129;
-  }
-
-  if (have_top) {
-    uint8_t *above_ptr = *(x->base_dst) + x->dst - x->dst_stride;
-
-    if (have_left) {
-      top_left = above_ptr[-1];
-    } else {
-      top_left = 127;
-    }
-
-    above[0] = above_ptr[0];
-    above[1] = above_ptr[1];
-    above[2] = above_ptr[2];
-    above[3] = above_ptr[3];
-    if (((block_idx & 3) != 3) ||
-        (have_right && block_idx == 3 &&
-         ((xd->mb_index != 3 && xd->sb_index != 3) ||
-          ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {
-      above[4] = above_ptr[4];
-      above[5] = above_ptr[5];
-      above[6] = above_ptr[6];
-      above[7] = above_ptr[7];
-    } else if (have_right) {
-      uint8_t *above_right = above_ptr + 4;
-
-      if (xd->sb_index == 3 && (xd->mb_index & 1))
-        above_right -= 32 * x->dst_stride;
-      if (xd->mb_index == 3)
-        above_right -= 16 * x->dst_stride;
-      above_right -= (block_idx & ~3) * x->dst_stride;
-
-      /* use a more distant above-right (from closest available top-right
-       * corner), but with a "localized DC" (similar'ish to TM-pred):
-       *
-       *  A   B   C   D   E   F   G   H
-       *  I   J   K   L
-       *  M   N   O   P
-       *  Q   R   S   T
-       *  U   V   W   X   x1  x2  x3  x4
-       *
-       * Where:
-       * x1 = clip_pixel(E + X - D)
-       * x2 = clip_pixel(F + X - D)
-       * x3 = clip_pixel(G + X - D)
-       * x4 = clip_pixel(H + X - D)
-       *
-       * This is applied anytime when we use a "distant" above-right edge
-       * that is not immediately top-right to the block that we're going
-       * to do intra prediction for.
-       */
-      above[4] = clip_pixel(above_right[0] + above_ptr[3] - above_right[-1]);
-      above[5] = clip_pixel(above_right[1] + above_ptr[3] - above_right[-1]);
-      above[6] = clip_pixel(above_right[2] + above_ptr[3] - above_right[-1]);
-      above[7] = clip_pixel(above_right[3] + above_ptr[3] - above_right[-1]);
-    } else {
-      // extend edge
-      above[4] = above[5] = above[6] = above[7] = above[3];
-    }
-  } else {
-    above[0] = above[1] = above[2] = above[3] = 127;
-    above[4] = above[5] = above[6] = above[7] = 127;
-    top_left = 127;
-  }
-
-#if CONFIG_NEWBINTRAMODES
-  if (b_mode == B_CONTEXT_PRED)
-    b_mode = x->bmi.as_mode.context;
-#endif
-
-  switch (b_mode) {
-    case B_DC_PRED: {
-      int expected_dc = 0;
-
-      for (i = 0; i < 4; i++) {
-        expected_dc += above[i];
-        expected_dc += left[i];
-      }
-
-      expected_dc = (expected_dc + 4) >> 3;
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = expected_dc;
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_TM_PRED: {
-      /* prediction similar to true_motion prediction */
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = clip_pixel(above[c] - top_left + left[r]);
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_VE_PRED: {
-      unsigned int ap[4];
-
-      ap[0] = above[0];
-      ap[1] = above[1];
-      ap[2] = above[2];
-      ap[3] = above[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = ap[c];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-
-    case B_HE_PRED: {
-      unsigned int lp[4];
-
-      lp[0] = left[0];
-      lp[1] = left[1];
-      lp[2] = left[2];
-      lp[3] = left[3];
-
-      for (r = 0; r < 4; r++) {
-        for (c = 0; c < 4; c++) {
-          predictor[c] = lp[r];
-        }
-
-        predictor += 16;
-      }
-    }
-    break;
-    case B_LD_PRED: {
-      uint8_t *ptr = above;
-
-      predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
-      predictor[0 * 16 + 1] =
-        predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 1] =
-          predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[2 * 16 + 1] =
-            predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
-      predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
-    }
-    break;
-    case B_RD_PRED: {
-      uint8_t pp[9];
-
-      pp[0] = left[3];
-      pp[1] = left[2];
-      pp[2] = left[1];
-      pp[3] = left[0];
-      pp[4] = top_left;
-      pp[5] = above[0];
-      pp[6] = above[1];
-      pp[7] = above[2];
-      pp[8] = above[3];
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[3 * 16 + 2] =
-        predictor[2 * 16 + 1] =
-          predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 3] =
-        predictor[2 * 16 + 2] =
-          predictor[1 * 16 + 1] =
-            predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 2] =
-          predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
-    }
-    break;
-    case B_VR_PRED: {
-      uint8_t pp[9];
-
-      pp[0] = left[3];
-      pp[1] = left[2];
-      pp[2] = left[1];
-      pp[3] = left[0];
-      pp[4] = top_left;
-      pp[5] = above[0];
-      pp[6] = above[1];
-      pp[7] = above[2];
-      pp[8] = above[3];
-
-      predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
-      predictor[3 * 16 + 2] =
-        predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
-      predictor[3 * 16 + 3] =
-        predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-      predictor[2 * 16 + 3] =
-        predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
-      predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
-    }
-    break;
-    case B_VL_PRED: {
-      uint8_t *pp = above;
-
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[1 * 16 + 1] =
-        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 1] =
-        predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[3 * 16 + 1] =
-        predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[0 * 16 + 3] =
-        predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HD_PRED: {
-      uint8_t pp[9];
-
-      pp[0] = left[3];
-      pp[1] = left[2];
-      pp[2] = left[1];
-      pp[3] = left[0];
-      pp[4] = top_left;
-      pp[5] = above[0];
-      pp[6] = above[1];
-      pp[7] = above[2];
-      pp[8] = above[3];
-
-
-      predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[2 * 16 + 0] =
-        predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[2 * 16 + 1] =
-        predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[2 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-      predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-      predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-
-    case B_HU_PRED: {
-      uint8_t *pp = left;
-      predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-      predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-      predictor[0 * 16 + 2] =
-        predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
-      predictor[0 * 16 + 3] =
-        predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-      predictor[1 * 16 + 2] =
-        predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-      predictor[1 * 16 + 3] =
-        predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
-      predictor[2 * 16 + 2] =
-        predictor[2 * 16 + 3] =
-          predictor[3 * 16 + 0] =
-            predictor[3 * 16 + 1] =
-              predictor[3 * 16 + 2] =
-                predictor[3 * 16 + 3] = pp[3];
-    }
-    break;
-
-#if CONFIG_NEWBINTRAMODES
-    case B_CONTEXT_PRED:
-    break;
-    /*
-    case B_CORNER_PRED:
-    corner_predictor(predictor, 16, 4, above, left);
-    break;
-    */
-#endif
-  }
-}
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -5,14 +5,13 @@
  */
 
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_enums.h"
 
 struct loop_filter_info;
-struct blockd;
 struct macroblockd;
 struct loop_filter_info;
 
 /* Encoder forward decls */
-struct block;
 struct macroblock;
 struct vp9_variance_vtable;
 
@@ -26,33 +25,27 @@
 #
 # Dequant
 #
-prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block_8x8
+prototype void vp9_idct_add_y_block_8x8 "int16_t *q, uint8_t *dst, int stride, struct macroblockd *xd"
+specialize vp9_idct_add_y_block_8x8
 
-prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_8x8
+prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob"
+specialize vp9_idct_add_16x16
 
-prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add_16x16
+prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob"
+specialize vp9_idct_add_8x8
 
-prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add_8x8
+prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob"
+specialize vp9_idct_add
 
-prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add
+prototype void vp9_idct_add_y_block "int16_t *q, uint8_t *dst, int stride, struct macroblockd *xd"
+specialize vp9_idct_add_y_block
 
-prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_y_block
+prototype void vp9_idct_add_uv_block "int16_t *q, uint8_t *dst, int stride, uint16_t *eobs"
+specialize vp9_idct_add_uv_block
 
-prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block
+prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob"
+specialize vp9_idct_add_32x32
 
-prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"
-specialize vp9_dequant_idct_add_32x32
-
-prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"
-specialize vp9_dequant_idct_add_uv_block_16x16
-
 #
 # RECON
 #
@@ -67,98 +60,26 @@
 prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_b
+prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available"
+specialize void vp9_build_intra_predictors
 
-prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_uv_b
+prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
+specialize vp9_build_intra_predictors_sby_s
 
-prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon2b sse2
+prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
+specialize vp9_build_intra_predictors_sbuv_s
 
-prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon4b sse2
-
-prototype void vp9_recon_mb "struct macroblockd *x"
-specialize vp9_recon_mb
-
-prototype void vp9_recon_mby "struct macroblockd *x"
-specialize vp9_recon_mby
-
-prototype void vp9_recon_mby_s "struct macroblockd *x, uint8_t *dst"
-specialize vp9_recon_mby_s
-
-prototype void vp9_recon_mbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
-specialize void vp9_recon_mbuv_s
-
-prototype void vp9_recon_sby_s "struct macroblockd *x, uint8_t *dst"
-specialize vp9_recon_sby_s
-
-prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
-specialize void vp9_recon_sbuv_s
-
-prototype void vp9_recon_sb64y_s "struct macroblockd *x, uint8_t *dst"
-specialize vp9_recon_sb64y_s
-
-prototype void vp9_recon_sb64uv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
-specialize void vp9_recon_sb64uv_s
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s
-
-prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sby_s;
-
-prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sbuv_s;
-
-prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby;
-
-prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mby_s;
-
-prototype void vp9_build_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv;
-
-prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_mbuv_s;
-
-prototype void vp9_build_intra_predictors_sb64y_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sb64y_s;
-
-prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"
-specialize vp9_build_intra_predictors_sb64uv_s;
-
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
+prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra4x4_predict;
 
-prototype void vp9_intra8x8_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
-specialize vp9_intra8x8_predict;
-
-prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
-specialize vp9_intra_uv4x4_predict;
-
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
-prototype void vp9_add_residual_8x8 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_8x8 sse2
-
-prototype void vp9_add_residual_16x16 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
-prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
-prototype void vp9_add_constant_residual_8x8 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
-prototype void vp9_add_constant_residual_16x16 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_16x16 sse2
 
-prototype void vp9_add_constant_residual_32x32 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_32x32 sse2
 fi
 
@@ -165,54 +86,24 @@
 #
 # Loopfilter
 #
-prototype void vp9_loop_filter_mbv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbv sse2
+prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mb_lpf_vertical_edge_w
 
-prototype void vp9_loop_filter_bv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv sse2
+prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mbloop_filter_vertical_edge
 
-prototype void vp9_loop_filter_bv8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bv8x8 sse2
+prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_loop_filter_vertical_edge
 
-prototype void vp9_loop_filter_mbh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_mbh sse2
+prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mb_lpf_horizontal_edge_w
 
-prototype void vp9_loop_filter_bh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh sse2
+prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_mbloop_filter_horizontal_edge
 
-prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_loop_filter_bh8x8 sse2
+prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
+specialize vp9_loop_filter_horizontal_edge
 
-prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-
-prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-
-prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-
-prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-
-prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_lpf_mbh_w sse2
-
-prototype void vp9_lpf_mbv_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp9_lpf_mbv_w sse2
-
 #
 # post proc
 #
@@ -225,7 +116,7 @@
 specialize vp9_mbpost_proc_across_ip sse2
 vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
 
-prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
+prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
 specialize vp9_post_proc_down_and_across mmx sse2
 vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
 
@@ -244,18 +135,6 @@
 specialize vp9_blend_b
 
 #
-# sad 16x3, 3x16
-#
-prototype unsigned int vp9_sad16x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
-specialize vp9_sad16x3 sse2
-
-prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
-specialize vp9_sad3x16 sse2
-
-prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x2 sse2
-
-#
 # Sub Pixel Filters
 #
 prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
@@ -276,123 +155,64 @@
 prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_vert ssse3
 
-#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT
-prototype void vp9_convolve8_1by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8
-
-prototype void vp9_convolve8_qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr
-
-prototype void vp9_convolve8_3by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8
-
-prototype void vp9_convolve8_5by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8
-
-prototype void vp9_convolve8_3qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr
-
-prototype void vp9_convolve8_7by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8
-
-prototype void vp9_convolve8_1by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8_horiz
-
-prototype void vp9_convolve8_qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr_horiz
-
-prototype void vp9_convolve8_3by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8_horiz
-
-prototype void vp9_convolve8_5by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8_horiz
-
-prototype void vp9_convolve8_3qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr_horiz
-
-prototype void vp9_convolve8_7by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8_horiz
-
-prototype void vp9_convolve8_1by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_1by8_vert
-
-prototype void vp9_convolve8_qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_qtr_vert
-
-prototype void vp9_convolve8_3by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3by8_vert
-
-prototype void vp9_convolve8_5by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_5by8_vert
-
-prototype void vp9_convolve8_3qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_3qtr_vert
-
-prototype void vp9_convolve8_7by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_7by8_vert
-#endif
-
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add
 
-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2
 
-prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct8x8 sse2
+prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_add sse2
 
-prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_8x8 sse2
+prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_8x8_add sse2
 
 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_8x8
 
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
 
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
 
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
 
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
 
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
-
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
 
-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht8x8
+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
 
-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
+prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht8x8_add
 
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
 
 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
-
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
 
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add
+
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
 
@@ -408,66 +228,148 @@
 # variance
 [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
 
+prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x16 sse2
+
+prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance16x32 sse2
+
+prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance64x32 sse2
+
+prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance32x64 sse2
+
 prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x32
+specialize vp9_variance32x32 sse2
 
 prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance64x64
+specialize vp9_variance64x64 sse2
 
 prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x16 mmx sse2
-vp9_variance16x16_sse2=vp9_variance16x16_wmt
-vp9_variance16x16_mmx=vp9_variance16x16_mmx
 
 prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x8 mmx sse2
-vp9_variance16x8_sse2=vp9_variance16x8_wmt
-vp9_variance16x8_mmx=vp9_variance16x8_mmx
 
 prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x16 mmx sse2
-vp9_variance8x16_sse2=vp9_variance8x16_wmt
-vp9_variance8x16_mmx=vp9_variance8x16_mmx
 
 prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x8 mmx sse2
-vp9_variance8x8_sse2=vp9_variance8x8_wmt
-vp9_variance8x8_mmx=vp9_variance8x8_mmx
 
+prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"
+specialize vp9_get_sse_sum_8x8 sse2
+vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2
+
+prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance8x4 sse2
+
+prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance4x8 sse2
+
 prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance4x4 mmx sse2
-vp9_variance4x4_sse2=vp9_variance4x4_wmt
-vp9_variance4x4_mmx=vp9_variance4x4_mmx
 
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
+prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x64
+
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
+prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance64x32
+
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
+prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance32x16
+
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
+prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x32
+
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
+# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
+prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance8x4
+
+prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x4
+
+prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_sub_pixel_variance4x8
+
+prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
 
+prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad32x64 sse2
+
+prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad64x32 sse2
+
+prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad32x16 sse2
+
+prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad16x32 sse2
+
 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad32x32 sse2
 
@@ -483,6 +385,13 @@
 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 
+# TODO(jingning): need to convert these functions into mmx/sse2 form
+prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad8x4
+
+prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp9_sad4x8
+
 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse
 
@@ -555,6 +464,12 @@
 prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad8x8x8 sse4
 
+prototype void vp9_sad8x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
+specialize vp9_sad8x4x8
+
+prototype void vp9_sad4x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
+specialize vp9_sad4x8x8
+
 prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
@@ -561,6 +476,18 @@
 prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad64x64x4d sse2
 
+prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x64x4d sse2
+
+prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad64x32x4d sse2
+
+prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad32x16x4d sse2
+
+prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad16x32x4d sse2
+
 prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d sse2
 
@@ -576,6 +503,13 @@
 prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse2
 
+# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad8x4x4d
+
+prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp9_sad4x8x4d
+
 prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse
 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
@@ -585,6 +519,15 @@
 specialize vp9_mse16x16 mmx sse2
 vp9_mse16x16_sse2=vp9_mse16x16_wmt
 
+prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse8x16
+
+prototype unsigned int vp9_mse16x8 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse16x8
+
+prototype unsigned int vp9_mse8x8 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
+specialize vp9_mse8x8
+
 prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_mse64x64
 
@@ -594,30 +537,11 @@
 prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
-prototype int vp9_mbblock_error "struct macroblock *mb"
-specialize vp9_mbblock_error mmx sse2
-vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
 
 prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
 specialize vp9_block_error mmx sse2
 vp9_block_error_sse2=vp9_block_error_xmm
 
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype int vp9_mbuverror "struct macroblock *mb"
-specialize vp9_mbuverror mmx sse2
-vp9_mbuverror_sse2=vp9_mbuverror_xmm
-
-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp9_subtract_b mmx sse2
-
-prototype void vp9_subtract_mby "int16_t *diff, uint8_t *src, uint8_t *pred, int stride"
-specialize vp9_subtract_mby mmx sse2
-
-prototype void vp9_subtract_mbuv "int16_t *diff, uint8_t *usrc, uint8_t *vsrc, uint8_t *pred, int stride"
-specialize vp9_subtract_mbuv mmx sse2
-
 #
 # Structured Similarity (SSIM)
 #
@@ -665,16 +589,16 @@
 #
 # Motion search
 #
-prototype int vp9_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_full_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n"
 specialize vp9_full_search_sad sse3 sse4_1
 vp9_full_search_sad_sse3=vp9_full_search_sadx3
 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
 
-prototype int vp9_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_refining_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_refining_search_sad sse3
 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
 
-prototype int vp9_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_diamond_search_sad "struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
 specialize vp9_diamond_search_sad sse3
 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
 
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -12,8 +12,8 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_seg_common.h"
 
-static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
-static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 0xf, 0xf };
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 3, 0 };
 
 // These functions provide access to new segment level features.
 // Eventually these function may be "optimized out" but for the moment,
@@ -20,13 +20,10 @@
 // the coding mechanism is still subject to change so these provide a
 // convenient single point of change.
 
-int vp9_segfeature_active(const MACROBLOCKD *xd,
-                          int segment_id,
+int vp9_segfeature_active(const MACROBLOCKD *xd, int segment_id,
                           SEG_LVL_FEATURES feature_id) {
-  // Return true if mask bit set and segmentation enabled.
-  return (xd->segmentation_enabled &&
-          (xd->segment_feature_mask[segment_id] &
-           (0x01 << feature_id)));
+  return xd->segmentation_enabled &&
+         (xd->segment_feature_mask[segment_id] & (1 << feature_id));
 }
 
 void vp9_clearall_segfeatures(MACROBLOCKD *xd) {
@@ -34,14 +31,12 @@
   vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask));
 }
 
-void vp9_enable_segfeature(MACROBLOCKD *xd,
-                           int segment_id,
+void vp9_enable_segfeature(MACROBLOCKD *xd, int segment_id,
                            SEG_LVL_FEATURES feature_id) {
-  xd->segment_feature_mask[segment_id] |= (0x01 << feature_id);
+  xd->segment_feature_mask[segment_id] |= 1 << feature_id;
 }
 
-void vp9_disable_segfeature(MACROBLOCKD *xd,
-                            int segment_id,
+void vp9_disable_segfeature(MACROBLOCKD *xd, int segment_id,
                             SEG_LVL_FEATURES feature_id) {
   xd->segment_feature_mask[segment_id] &= ~(1 << feature_id);
 }
@@ -51,22 +46,19 @@
 }
 
 int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
-  return segfeaturedata_signed[feature_id];
+  return seg_feature_data_signed[feature_id];
 }
 
-void vp9_clear_segdata(MACROBLOCKD *xd,
-                       int segment_id,
+void vp9_clear_segdata(MACROBLOCKD *xd, int segment_id,
                        SEG_LVL_FEATURES feature_id) {
   xd->segment_feature_data[segment_id][feature_id] = 0;
 }
 
-void vp9_set_segdata(MACROBLOCKD *xd,
-                     int segment_id,
-                     SEG_LVL_FEATURES feature_id,
-                     int seg_data) {
+void vp9_set_segdata(MACROBLOCKD *xd, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
   assert(seg_data <= seg_feature_data_max[feature_id]);
   if (seg_data < 0) {
-    assert(segfeaturedata_signed[feature_id]);
+    assert(seg_feature_data_signed[feature_id]);
     assert(-seg_data <= seg_feature_data_max[feature_id]);
   }
 
@@ -73,33 +65,16 @@
   xd->segment_feature_data[segment_id][feature_id] = seg_data;
 }
 
-int vp9_get_segdata(const MACROBLOCKD *xd,
-                    int segment_id,
+int vp9_get_segdata(const MACROBLOCKD *xd, int segment_id,
                     SEG_LVL_FEATURES feature_id) {
   return xd->segment_feature_data[segment_id][feature_id];
 }
 
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0;
-}
 
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame) {
-  xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |=
-    (1 << ref_frame);
-}
+const vp9_tree_index vp9_segment_tree[14] = {
+  2,  4,  6,  8, 10, 12,
+  0, -1, -2, -3, -4, -5, -6, -7
+};
 
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          (1 << ref_frame)) ? 1 : 0;
-}
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) {
-  return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] &
-          ~(1 << INTRA_FRAME)) ? 1 : 0;
-}
 
 // TBD? Functions to read and write segment data with range / validity checking
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -45,17 +45,7 @@
                     int segment_id,
                     SEG_LVL_FEATURES feature_id);
 
-void vp9_clear_segref(MACROBLOCKD *xd, int segment_id);
-
-void vp9_set_segref(MACROBLOCKD *xd,
-                    int segment_id,
-                    MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref(const MACROBLOCKD *xd,
-                     int segment_id,
-                     MV_REFERENCE_FRAME ref_frame);
-
-int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);
+extern const vp9_tree_index vp9_segment_tree[14];
 
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 
--- a/vp9/common/vp9_setupintrarecon.c
+++ /dev/null
@@ -1,31 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-
-  /* set up frame new frame for intra coded blocks */
-  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-  for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = (uint8_t) 129;
-
-  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->u_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
-
-  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++)
-    ybf->v_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
-
-}
--- a/vp9/common/vp9_setupintrarecon.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SETUPINTRARECON_H_
-#define VP9_COMMON_VP9_SETUPINTRARECON_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
-
-#endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
--- a/vp9/common/vp9_swapyv12buffer.c
+++ /dev/null
@@ -1,32 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/vp9_swapyv12buffer.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame) {
-  uint8_t *temp;
-
-  temp = last_frame->buffer_alloc;
-  last_frame->buffer_alloc = new_frame->buffer_alloc;
-  new_frame->buffer_alloc = temp;
-
-  temp = last_frame->y_buffer;
-  last_frame->y_buffer = new_frame->y_buffer;
-  new_frame->y_buffer = temp;
-
-  temp = last_frame->u_buffer;
-  last_frame->u_buffer = new_frame->u_buffer;
-  new_frame->u_buffer = temp;
-
-  temp = last_frame->v_buffer;
-  last_frame->v_buffer = new_frame->v_buffer;
-  new_frame->v_buffer = temp;
-}
--- a/vp9/common/vp9_swapyv12buffer.h
+++ /dev/null
@@ -1,19 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SWAPYV12BUFFER_H_
-#define VP9_COMMON_VP9_SWAPYV12BUFFER_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
-                          YV12_BUFFER_CONFIG *last_frame);
-
-#endif  // VP9_COMMON_VP9_SWAPYV12BUFFER_H_
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -17,27 +17,27 @@
 
 static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
                                  int *max_tile_off, int tile_idx,
-                                 int log2_n_tiles, int n_mbs) {
-  const int n_sbs = (n_mbs + 3) >> 2;
+                                 int log2_n_tiles, int n_mis) {
+  const int n_sbs = (n_mis + 7) >> 3;
   const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;
   const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
 
-  *min_tile_off = MIN(sb_off1 << 2, n_mbs);
-  *max_tile_off = MIN(sb_off2 << 2, n_mbs);
+  *min_tile_off = MIN(sb_off1 << 3, n_mis);
+  *max_tile_off = MIN(sb_off2 << 3, n_mis);
 }
 
 void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
   cm->cur_tile_col_idx = tile_col_idx;
-  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_col_start,
-                       &cm->cur_tile_mb_col_end, tile_col_idx,
-                       cm->log2_tile_columns, cm->mb_cols);
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mi_col_start,
+                       &cm->cur_tile_mi_col_end, tile_col_idx,
+                       cm->log2_tile_columns, cm->mi_cols);
 }
 
 void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
   cm->cur_tile_row_idx = tile_row_idx;
-  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_row_start,
-                       &cm->cur_tile_mb_row_end, tile_row_idx,
-                       cm->log2_tile_rows, cm->mb_rows);
+  vp9_get_tile_offsets(cm, &cm->cur_tile_mi_row_start,
+                       &cm->cur_tile_mi_row_end, tile_row_idx,
+                       cm->log2_tile_rows, cm->mi_rows);
 }
 
 
@@ -49,10 +49,15 @@
   for (max_log2_n_tiles = 0;
        (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS;
        max_log2_n_tiles++) {}
+  max_log2_n_tiles--;
+  if (max_log2_n_tiles <  0)
+    max_log2_n_tiles = 0;
+
   for (min_log2_n_tiles = 0;
        (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols;
        min_log2_n_tiles++) {}
 
+  assert(max_log2_n_tiles >= min_log2_n_tiles);
   *min_log2_n_tiles_ptr = min_log2_n_tiles;
   *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;
 }
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -14,19 +14,13 @@
 #if defined(CONFIG_DEBUG) && CONFIG_DEBUG
 #include <assert.h>
 #endif
-#include <stdio.h>
 
 #include "vp9/common/vp9_treecoder.h"
 
-static void tree2tok(
-  struct vp9_token_struct *const p,
-  vp9_tree t,
-  int i,
-  int v,
-  int L
-) {
+static void tree2tok(struct vp9_token *const p, vp9_tree t,
+                    int i, int v, int l) {
   v += v;
-  ++L;
+  ++l;
 
   do {
     const vp9_tree_index j = t[i++];
@@ -33,17 +27,17 @@
 
     if (j <= 0) {
       p[-j].value = v;
-      p[-j].Len = L;
+      p[-j].len = l;
     } else
-      tree2tok(p, t, j, v, L);
+      tree2tok(p, t, j, v, l);
   } while (++v & 1);
 }
 
-void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
+void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) {
   tree2tok(p, t, 0, 0, 0);
 }
 
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
+void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
                                  int offset) {
   tree2tok(p - offset, t, 0, 0, 0);
 }
@@ -62,12 +56,12 @@
     left = convert_distribution(tree[i], tree, probs, branch_ct,
                                 num_events, tok0_offset);
   }
-  if (tree[i + 1] <= 0) {
+  if (tree[i + 1] <= 0)
     right = num_events[-tree[i + 1] - tok0_offset];
-  } else {
+  else
     right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
-                                num_events, tok0_offset);
-  }
+                                 num_events, tok0_offset);
+
   probs[i>>1] = get_binary_prob(left, right);
   branch_ct[i>>1][0] = left;
   branch_ct[i>>1][1] = right;
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -13,6 +13,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
 
 typedef uint8_t vp9_prob;
 
@@ -31,16 +32,15 @@
 
 typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
 
-typedef const struct vp9_token_struct {
+struct vp9_token {
   int value;
-  int Len;
-} vp9_token;
+  int len;
+};
 
 /* Construct encoding array from tree. */
 
-void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
-                                 int offset);
+void vp9_tokens_from_tree(struct vp9_token*, vp9_tree);
+void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
 
 /* Convert array of token occurrence counts into a table of probabilities
    for the associated binary encoding tree.  Also writes count of branches
@@ -76,7 +76,7 @@
 
 /* this function assumes prob1 and prob2 are already within [1,255] range */
 static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
-  return (prob1 * (256 - factor) + prob2 * factor + 128) >> 8;
+  return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
 
 #endif  // VP9_COMMON_VP9_TREECODER_H_
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -278,45 +278,20 @@
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
-  // check w/h due to fixed size fdata2 array
-  assert(w <= 16);
-  assert(h <= 16);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_ssse3(fdata2, 16,
-                                    dst, dst_stride,
-                                    h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_ssse3(fdata2, 16,
-                                   dst, dst_stride,
-                                   h, filter_y);
-      return;
-    }
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+  } else {
+    vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_c(src, src_stride, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h);
 }
 
 void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
@@ -324,44 +299,20 @@
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
 
-  // check w/h due to fixed size fdata2 array
-  assert(w <= 16);
-  assert(h <= 16);
-
-  if (x_step_q4 == 16 && y_step_q4 == 16 &&
-      filter_x[3] != 128 && filter_y[3] != 128) {
-    if (w == 16) {
-      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
-                                    fdata2, 16,
-                                    h + 7, filter_x);
-      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
-                                        dst, dst_stride,
-                                        h, filter_y);
-      return;
-    }
-    if (w == 8) {
-      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
-    if (w == 4) {
-      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
-                                   fdata2, 16,
-                                   h + 7, filter_x);
-      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
-                                       dst, dst_stride,
-                                       h, filter_y);
-      return;
-    }
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 == 16 && y_step_q4 == 16) {
+    vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h + 7);
+    vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
+                                 filter_x, x_step_q4, filter_y, y_step_q4,
+                                 w, h);
+  } else {
+    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   }
-  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
-                      filter_x, x_step_q4, filter_y, y_step_q4,
-                      w, h);
 }
 #endif
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -73,7 +73,7 @@
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -81,7 +81,6 @@
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
 
   // Rows
@@ -188,14 +187,23 @@
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  {                                                     \
+      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      *(int *)dest = _mm_cvtsi128_si32(d0); \
+      dest += stride; \
+  }
 
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
+
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
@@ -403,8 +411,18 @@
   in6 = _mm_subs_epi16(stp1_1, stp1_6); \
   in7 = _mm_subs_epi16(stp1_0, stp2_7);
 
-void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
+
+void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -461,19 +479,17 @@
   in6 = _mm_srai_epi16(in6, 5);
   in7 = _mm_srai_epi16(in7, 5);
 
-  // Store results
-  _mm_store_si128((__m128i *)output, in0);
-  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+  RECON_AND_STORE(dest, in0);
+  RECON_AND_STORE(dest, in1);
+  RECON_AND_STORE(dest, in2);
+  RECON_AND_STORE(dest, in3);
+  RECON_AND_STORE(dest, in4);
+  RECON_AND_STORE(dest, in5);
+  RECON_AND_STORE(dest, in6);
+  RECON_AND_STORE(dest, in7);
 }
 
-void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -612,15 +628,14 @@
   in6 = _mm_srai_epi16(in6, 5);
   in7 = _mm_srai_epi16(in7, 5);
 
-  // Store results
-  _mm_store_si128((__m128i *)output, in0);
-  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+  RECON_AND_STORE(dest, in0);
+  RECON_AND_STORE(dest, in1);
+  RECON_AND_STORE(dest, in2);
+  RECON_AND_STORE(dest, in3);
+  RECON_AND_STORE(dest, in4);
+  RECON_AND_STORE(dest, in5);
+  RECON_AND_STORE(dest, in6);
+  RECON_AND_STORE(dest, in7);
 }
 
 #define IDCT16x16_1D \
@@ -752,8 +767,7 @@
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -938,31 +952,30 @@
       in14 = _mm_srai_epi16(in14, 6);
       in15 = _mm_srai_epi16(in15, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
 
-      output += 8;
+      dest += 8 - (stride * 16);
     }
   }
 }
 
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+                                     int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1007,7 +1020,6 @@
           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
-
   // 1-D idct. Load input data.
   in0 = _mm_load_si128((__m128i *)input);
   in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
@@ -1298,29 +1310,28 @@
     in14 = _mm_srai_epi16(in14, 6);
     in15 = _mm_srai_epi16(in15, 6);
 
-    // Store results
-    _mm_store_si128((__m128i *)output, in0);
-    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-    output += 8;
+    RECON_AND_STORE(dest, in0);
+    RECON_AND_STORE(dest, in1);
+    RECON_AND_STORE(dest, in2);
+    RECON_AND_STORE(dest, in3);
+    RECON_AND_STORE(dest, in4);
+    RECON_AND_STORE(dest, in5);
+    RECON_AND_STORE(dest, in6);
+    RECON_AND_STORE(dest, in7);
+    RECON_AND_STORE(dest, in8);
+    RECON_AND_STORE(dest, in9);
+    RECON_AND_STORE(dest, in10);
+    RECON_AND_STORE(dest, in11);
+    RECON_AND_STORE(dest, in12);
+    RECON_AND_STORE(dest, in13);
+    RECON_AND_STORE(dest, in14);
+    RECON_AND_STORE(dest, in15);
+
+    dest += 8 - (stride * 16);
   }
 }
 
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -1832,6 +1843,8 @@
       col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
+      const __m128i zero = _mm_setzero_si128();
+
       // 2_D: Calculate the results and store them to destination.
       in0 = _mm_add_epi16(stp1_0, stp1_31);
       in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1946,40 @@
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
 
-      output += 8;
+      dest += 8 - (stride * 32);
     }
   }
 }
--- a/vp9/common/x86/vp9_idct_sse2.asm
+++ /dev/null
@@ -1,712 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_idct_dequant_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   int blk_stride      - 5
-; )
-
-global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    ; end prolog
-
-        mov         rdx,            arg(1) ; dequant
-        mov         rax,            arg(0) ; qcoeff
-
-        movd        xmm4,           [rax]
-        movd        xmm5,           [rdx]
-
-        pinsrw      xmm4,           [rax+32],   4
-        pinsrw      xmm5,           [rdx],      4
-
-        pmullw      xmm4,           xmm5
-
-    ; Zero out xmm5, for use unpacking
-        pxor        xmm5,           xmm5
-
-    ; clear coeffs
-        movd        [rax],          xmm5
-        movd        [rax+32],       xmm5
-;pshufb
-        pshuflw     xmm4,           xmm4,       00000000b
-        pshufhw     xmm4,           xmm4,       00000000b
-
-        mov         rax,            arg(2) ; pre
-        paddw       xmm4,           [GLOBAL(fours)]
-
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-        psraw       xmm4,           3
-
-        movq        xmm0,           [rax]
-        movq        xmm1,           [rax+rcx]
-        movq        xmm2,           [rax+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm3,           [rax+rcx]
-
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-        mov         rax,            arg(3) ; dst
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; store blocks back out
-        movq        [rax],          xmm0
-        movq        [rax + rdx],    xmm1
-
-        lea         rax,            [rax + 2*rdx]
-
-        movq        [rax],          xmm2
-        movq        [rax + rdx],    xmm3
-
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        movsxd      rcx,            dword ptr arg(5) ; blk_stride
-
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for shuffle
-    ;   to spit out sensicle data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+2*rcx]
-        lea         rcx,            [3*rcx]
-        movq        xmm5,           [rsi+rcx]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_idct_dequant_dc_0_2x_sse2
-; (
-;   short *qcoeff       - 0
-;   short *dequant      - 1
-;   unsigned char *pre  - 2
-;   unsigned char *dst  - 3
-;   int dst_stride      - 4
-;   short *dc           - 5
-; )
-global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_dc_0_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-        mov         rdx,            arg(5) ; dc
-
-    ; Zero out xmm5, for use unpacking
-        pxor        xmm5,           xmm5
-
-    ; load up 2 dc words here == 2*16 = doubleword
-        movd        xmm4,           [rdx]
-
-    ; Load up predict blocks
-        movq        xmm0,           [rsi]
-        movq        xmm1,           [rsi+16]
-        movq        xmm2,           [rsi+32]
-        movq        xmm3,           [rsi+48]
-
-    ; Duplicate and expand dc across
-        punpcklwd   xmm4,           xmm4
-        punpckldq   xmm4,           xmm4
-
-    ; Rounding to dequant and downshift
-        paddw       xmm4,           [GLOBAL(fours)]
-        psraw       xmm4,           3
-
-    ; Predict buffer needs to be expanded from bytes to words
-        punpcklbw   xmm0,           xmm5
-        punpcklbw   xmm1,           xmm5
-        punpcklbw   xmm2,           xmm5
-        punpcklbw   xmm3,           xmm5
-
-    ; Add to predict buffer
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm4
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm4
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm5
-        packuswb    xmm1,           xmm5
-        packuswb    xmm2,           xmm5
-        packuswb    xmm3,           xmm5
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE
-sym(vp9_idct_dequant_dc_full_2x_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; special case when 2 blocks have 0 or 1 coeffs
-    ; dc is set as first coeff, so no need to load qcoeff
-        mov         rax,            arg(0) ; qcoeff
-        mov         rsi,            arg(2) ; pre
-        mov         rdi,            arg(3) ; dst
-
-    ; Zero out xmm7, for use unpacking
-        pxor        xmm7,           xmm7
-
-        mov         rdx,            arg(1)  ; dequant
-
-    ; note the transpose of xmm1 and xmm2, necessary for shuffle
-    ;   to spit out sensicle data
-        movdqa      xmm0,           [rax]
-        movdqa      xmm2,           [rax+16]
-        movdqa      xmm1,           [rax+32]
-        movdqa      xmm3,           [rax+48]
-
-    ; Clear out coeffs
-        movdqa      [rax],          xmm7
-        movdqa      [rax+16],       xmm7
-        movdqa      [rax+32],       xmm7
-        movdqa      [rax+48],       xmm7
-
-    ; dequantize qcoeff buffer
-        pmullw      xmm0,           [rdx]
-        pmullw      xmm2,           [rdx+16]
-        pmullw      xmm1,           [rdx]
-        pmullw      xmm3,           [rdx+16]
-
-    ; DC component
-        mov         rdx,            arg(5)
-
-    ; repack so block 0 row x and block 1 row x are together
-        movdqa      xmm4,           xmm0
-        punpckldq   xmm0,           xmm1
-        punpckhdq   xmm4,           xmm1
-
-        pshufd      xmm0,           xmm0,       11011000b
-        pshufd      xmm1,           xmm4,       11011000b
-
-        movdqa      xmm4,           xmm2
-        punpckldq   xmm2,           xmm3
-        punpckhdq   xmm4,           xmm3
-
-        pshufd      xmm2,           xmm2,       11011000b
-        pshufd      xmm3,           xmm4,       11011000b
-
-    ; insert DC component
-        pinsrw      xmm0,           [rdx],      0
-        pinsrw      xmm0,           [rdx+2],    4
-
-    ; first pass
-        psubw       xmm0,           xmm2        ; b1 = 0-2
-        paddw       xmm2,           xmm2        ;
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0        ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5        ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5        ; d1
-        movdqa      xmm6,           xmm2        ; a1
-
-        movdqa      xmm4,           xmm0        ; b1
-        paddw       xmm2,           xmm3        ;0
-
-        paddw       xmm4,           xmm7        ;1
-        psubw       xmm0,           xmm7        ;2
-
-        psubw       xmm6,           xmm3        ;3
-
-    ; transpose for the second pass
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-    ; second pass
-        psubw       xmm0,           xmm2            ; b1 = 0-2
-        paddw       xmm2,           xmm2
-
-        movdqa      xmm5,           xmm1
-        paddw       xmm2,           xmm0            ; a1 = 0+2
-
-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
-
-        movdqa      xmm7,           xmm3
-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
-
-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
-        psubw       xmm7,           xmm5            ; c1
-
-        movdqa      xmm5,           xmm1
-        movdqa      xmm4,           xmm3
-
-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
-        paddw       xmm5,           xmm1
-
-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
-        paddw       xmm3,           xmm4
-
-        paddw       xmm3,           xmm5            ; d1
-        paddw       xmm0,           [GLOBAL(fours)]
-
-        paddw       xmm2,           [GLOBAL(fours)]
-        movdqa      xmm6,           xmm2            ; a1
-
-        movdqa      xmm4,           xmm0            ; b1
-        paddw       xmm2,           xmm3            ;0
-
-        paddw       xmm4,           xmm7            ;1
-        psubw       xmm0,           xmm7            ;2
-
-        psubw       xmm6,           xmm3            ;3
-        psraw       xmm2,           3
-
-        psraw       xmm0,           3
-        psraw       xmm4,           3
-
-        psraw       xmm6,           3
-
-    ; transpose to save
-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
-
-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
-
-
-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
-
-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
-
-
-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
-
-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
-
-        pshufd      xmm0,           xmm2,       11011000b
-        pshufd      xmm2,           xmm1,       11011000b
-
-        pshufd      xmm1,           xmm5,       11011000b
-        pshufd      xmm3,           xmm7,       11011000b
-
-        pxor        xmm7,           xmm7
-
-    ; Load up predict blocks
-        movq        xmm4,           [rsi]
-        movq        xmm5,           [rsi+16]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm0,           xmm4
-        paddw       xmm1,           xmm5
-
-        movq        xmm4,           [rsi+32]
-        movq        xmm5,           [rsi+48]
-
-        punpcklbw   xmm4,           xmm7
-        punpcklbw   xmm5,           xmm7
-
-        paddw       xmm2,           xmm4
-        paddw       xmm3,           xmm5
-
-.finish:
-
-    ; pack up before storing
-        packuswb    xmm0,           xmm7
-        packuswb    xmm1,           xmm7
-        packuswb    xmm2,           xmm7
-        packuswb    xmm3,           xmm7
-
-    ; Load destination stride before writing out,
-    ;   doesn't need to persist
-        movsxd      rdx,            dword ptr arg(4) ; dst_stride
-
-    ; store blocks back out
-        movq        [rdi],          xmm0
-        movq        [rdi + rdx],    xmm1
-
-        lea         rdi,            [rdi + 2*rdx]
-
-        movq        [rdi],          xmm2
-        movq        [rdi + rdx],    xmm3
-
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-fours:
-    times 8 dw 0x0004
-align 16
-x_s1sqr2:
-    times 8 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 8 dw 0x4E7B
--- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
@@ -35,16 +35,6 @@
 
 }
 
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
@@ -66,9 +56,3 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1115,16 +1115,6 @@
                                             v_ptr + 4 * uv_stride);
 }
 
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
-                                              y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
@@ -1143,9 +1133,3 @@
                                           v_ptr + 4);
 }
 
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -593,349 +593,6 @@
     pop         rbp
     ret
 
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        mov         rcx, 2                ; count
-.nexts8_h:
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm3, [rdx]            ;
-
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movq        mm1, [rsi+2*rax]      ; p1
-        movq        mm0, [rdi]            ; q1
-        movq        mm2, mm1
-        movq        mm7, mm0
-        movq        mm4, mm0
-        psubusb     mm0, mm1              ; q1-=p1
-        psubusb     mm1, mm4              ; p1-=q1
-        por         mm1, mm0              ; abs(p1-q1)
-        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm1, 1                ; abs(p1-q1)/2
-
-        movq        mm5, [rsi+rax]        ; p0
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        movq        mm6, mm5              ; p0
-        psubusb     mm5, mm4              ; p0-=q0
-        psubusb     mm4, mm6              ; q0-=p0
-        por         mm5, mm4              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm3, mm3
-        pcmpeqb     mm5, mm3
-
-        ; start work on filters
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        mm5, mm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        movq        mm1, mm5              ; get a copy of filters
-        psraw       mm1, 11               ; arithmetic shift right 11
-        psllw       mm1, 8                ; shift left 8 to put it back
-
-        por         mm0, mm1              ; put the two together to get result
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        psraw       mm5, 11               ; arithmetic shift right 11
-        psllw       mm5, 8                ; shift left 8 to put it back
-        por         mm0, mm5              ; put the two together to get result
-
-
-        paddsb      mm6, mm0              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .nexts8_h
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi, [rsi + rax*4- 2];  ;
-        mov         rcx, 2                                      ; count
-.nexts8_v:
-
-        lea         rdi,        [rsi + rax];
-        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
-
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
-        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
-
-        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
-        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
-
-        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
-        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
-
-        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
-        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
-
-        neg         rax
-
-        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
-
-        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
-        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
-
-        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
-        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
-
-        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
-
-        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
-        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
-
-        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
-        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
-
-        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
-
-
-        ; calculate mask
-        movq        mm6,        mm0                             ; p1
-        movq        mm7,        mm3                             ; q1
-        psubusb     mm7,        mm6                             ; q1-=p1
-        psubusb     mm6,        mm3                             ; p1-=q1
-        por         mm6,        mm7                             ; abs(p1-q1)
-        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       mm6,        1                               ; abs(p1-q1)/2
-
-        movq        mm5,        mm1                             ; p0
-        movq        mm4,        mm2                             ; q0
-
-        psubusb     mm5,        mm2                             ; p0-=q0
-        psubusb     mm4,        mm1                             ; q0-=p0
-
-        por         mm5,        mm4                             ; abs(p0 - q0)
-        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
-        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                          ; get blimit
-        movq        mm7,        [rdx]
-
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm7,        mm7
-        pcmpeqb     mm5,        mm7                             ; mm5 = mask
-
-        ; start work on filters
-        movq        t0,         mm0
-        movq        t1,         mm3
-
-        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
-        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
-
-        psubsb      mm0,        mm3                             ; p1 - q1
-        movq        mm6,        mm1                             ; p0
-
-        movq        mm7,        mm2                             ; q0
-        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
-
-        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
-        movq        mm3,        mm7                             ; offseted ; q0
-
-        psubsb      mm7,        mm6                             ; q0 - p0
-        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        mm5,        mm0                             ; mask filter values we don't care about
-
-        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0,        mm5                             ; get a copy of filters
-        psllw       mm0,        8                               ; shift left 8
-        psraw       mm0,        3                               ; arithmetic shift right 11
-        psrlw       mm0,        8
-
-        movq        mm7,        mm5                             ; get a copy of filters
-        psraw       mm7,        11                              ; arithmetic shift right 11
-        psllw       mm7,        8                               ; shift left 8 to put it back
-
-        por         mm0,        mm7                             ; put the two together to get result
-
-        psubsb      mm3,        mm0                             ; q0-= q0sz add
-        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
-
-        movq        mm0, mm5                                    ; get a copy of filters
-        psllw       mm0, 8                                      ; shift left 8
-        psraw       mm0, 3                                      ; arithmetic shift right 11
-        psrlw       mm0, 8
-
-        psraw       mm5, 11                                     ; arithmetic shift right 11
-        psllw       mm5, 8                                      ; shift left 8 to put it back
-        por         mm0, mm5                                    ; put the two together to get result
-
-        paddsb      mm6, mm0                                    ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
-
-
-        movq        mm0,        t0
-        movq        mm4,        t1
-
-        ; mm0 = 70 60 50 40 30 20 10 00
-        ; mm6 = 71 61 51 41 31 21 11 01
-        ; mm3 = 72 62 52 42 32 22 12 02
-        ; mm4 = 73 63 53 43 33 23 13 03
-        ; transpose back to write out
-
-        movq        mm1,        mm0                         ;
-        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
-
-        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
-        movq        mm2,        mm3                         ;
-
-        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
-        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
-
-        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
-        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
-
-        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
-        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
-
-        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
-        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
-
-        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
-        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
-
-        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
-        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
-
-        psrlq       mm6,        32                          ; 33 32 31 30
-        movd        [rsi],      mm1                         ; write 43 42 41 40
-
-        movd        [rsi + rax], mm6                        ; write 33 32 31 30
-        neg         rax
-
-        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
-        psrlq       mm1,        32                          ; 53 52 51 50
-
-        movd        [rdi],      mm1                         ; write out 53 52 51 50
-        psrlq       mm5,        32                          ; 73 72 71 70
-
-        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
-
-        lea         rsi,        [rsi+rax*8]                 ; next 8
-
-        dec         rcx
-        jnz         .nexts8_v
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-;                  int y_stride,
-;                  loop_filter_info *lfi)
-;{
-;
-;
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
 SECTION_RODATA
 align 16
 tfe:
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -845,372 +845,6 @@
     pop         rbp
     ret
 
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
-
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
-        movdqa      xmm2, xmm1
-        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
-        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
-        por         xmm1, xmm0              ; abs(p1-q1)
-        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
-        psrlw       xmm1, 1                 ; abs(p1-q1)/2
-
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
-        movdqa      xmm0, xmm4              ; q0
-        movdqa      xmm6, xmm5              ; p0
-        psubusb     xmm5, xmm4              ; p0-=q0
-        psubusb     xmm4, xmm6              ; q0-=p0
-        por         xmm5, xmm4              ; abs(p0 - q0)
-        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
-        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm3, xmm3
-        pcmpeqb     xmm5, xmm3
-
-        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
-        psubsb      xmm2, xmm7              ; p1 - q1
-
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
-        psubsb      xmm0, xmm6              ; q0 - p0
-        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        xmm5, xmm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
-
-
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
-    push        rbp         ; save old base pointer value.
-    mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx         ; save callee-saved reg
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi - 2 ]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movd        xmm2,       [rdi]                   ; 13 12 11 10
-        movd        xmm3,       [rcx]                   ; 53 52 51 50
-        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
-        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
-
-        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
-        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
-        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
-
-        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
-        movdqa      xmm1,       xmm0
-        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        movdqa      xmm2,       xmm0
-        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
-        lea         rsi,        [rsi + rax*8]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm4,       [rsi]                   ; 83 82 81 80
-        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movd        xmm6,       [rdi]                   ; 93 92 91 90
-        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
-        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
-        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
-
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
-
-        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
-        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        movdqa      xmm1,       xmm0
-        movdqa      xmm3,       xmm2
-
-        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        ; calculate mask
-        movdqa      xmm6,       xmm0                            ; p1
-        movdqa      xmm7,       xmm3                            ; q1
-        psubusb     xmm7,       xmm0                            ; q1-=p1
-        psubusb     xmm6,       xmm3                            ; p1-=q1
-        por         xmm6,       xmm7                            ; abs(p1-q1)
-        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       xmm6,       1                               ; abs(p1-q1)/2
-
-        movdqa      xmm5,       xmm1                            ; p0
-        movdqa      xmm4,       xmm2                            ; q0
-        psubusb     xmm5,       xmm2                            ; p0-=q0
-        psubusb     xmm4,       xmm1                            ; q0-=p0
-        por         xmm5,       xmm4                            ; abs(p0 - q0)
-        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
-        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm7,        xmm7
-        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
-
-        ; start work on filters
-        movdqa        t0,        xmm0
-        movdqa        t1,        xmm3
-
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
-        psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; offseted ; q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        xmm5,        xmm0                           ; mask filter values we don't care about
-
-
-        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
-        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
-
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
-
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
-
-        movdqa      xmm0,        t0                             ; p1
-        movdqa      xmm4,        t1                             ; q1
-
-        ; transpose back to write out
-        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        movdqa      xmm1,       xmm0
-        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm5,       xmm3
-        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm2,       xmm0
-        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
-        movdqa      xmm3,       xmm1
-        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        ; write out order: xmm0 xmm2 xmm1 xmm3
-        lea         rdx,        [rsi + rax*4]
-
-        movd        [rsi],      xmm1                               ; write the second 8-line result
-        psrldq      xmm1,       4
-        movd        [rdi],      xmm1
-        psrldq      xmm1,       4
-        movd        [rsi + rax*2], xmm1
-        psrldq      xmm1,       4
-        movd        [rdi + rax*2], xmm1
-
-        movd        [rdx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rcx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rdx + rax*2], xmm3
-        psrldq      xmm3,       4
-        movd        [rcx + rax*2], xmm3
-
-        neg         rax
-        lea         rsi,        [rsi + rax*8]
-        neg         rax
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        [rsi],      xmm0                                ; write the first 8-line result
-        psrldq      xmm0,       4
-        movd        [rdi],      xmm0
-        psrldq      xmm0,       4
-        movd        [rsi + rax*2], xmm0
-        psrldq      xmm0,       4
-        movd        [rdi + rax*2], xmm0
-
-        movd        [rdx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rcx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rdx + rax*2], xmm2
-        psrldq      xmm2,       4
-        movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 tfe:
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@@ -23,10 +23,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
 #endif
 
 #if HAVE_SSE2
@@ -34,10 +30,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
 #endif
 
 #endif  // LOOPFILTER_X86_H
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ b/vp9/common/x86/vp9_recon_mmx.asm
@@ -10,55 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx) PRIVATE
-sym(vp9_recon_b_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov       rsi, arg(0) ;s
-        mov       rdi, arg(2) ;d
-        mov       rdx, arg(1) ;q
-        movsxd    rax, dword ptr arg(3) ;stride
-        pxor      mm0, mm0
-
-        movd      mm1, [rsi]
-        punpcklbw mm1, mm0
-        paddsw    mm1, [rdx]
-        packuswb  mm1,  mm0              ; pack and unpack to saturate
-        movd      [rdi], mm1
-
-        movd      mm2, [rsi+16]
-        punpcklbw mm2, mm0
-        paddsw    mm2, [rdx+32]
-        packuswb  mm2, mm0              ; pack and unpack to saturate
-        movd      [rdi+rax], mm2
-
-        movd      mm3, [rsi+32]
-        punpcklbw mm3, mm0
-        paddsw    mm3, [rdx+64]
-        packuswb  mm3,  mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm3
-
-        add       rdi, rax
-        movd      mm4, [rsi+48]
-        punpcklbw mm4, mm0
-        paddsw    mm4, [rdx+96]
-        packuswb  mm4, mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void copy_mem8x8_mmx(
 ;    unsigned char *src,
 ;    int src_stride,
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ b/vp9/common/x86/vp9_recon_sse2.asm
@@ -10,122 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2) PRIVATE
-sym(vp9_recon2b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movq        xmm1,       MMWORD PTR [rsi]
-        punpcklbw   xmm1,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi],   xmm1
-
-
-        movq        xmm2,       MMWORD PTR [rsi+8]
-        punpcklbw   xmm2,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+16]
-        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax],   xmm2
-
-
-        movq        xmm3,       MMWORD PTR [rsi+16]
-        punpcklbw   xmm3,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+32]
-        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm3
-
-        add         rdi, rax
-        movq        xmm4,       MMWORD PTR [rsi+24]
-        punpcklbw   xmm4,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+48]
-        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2) PRIVATE
-sym(vp9_recon4b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        movdqa      xmm5,       xmm1
-        punpcklbw   xmm1,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        paddsw      xmm5,       XMMWORD PTR [rdx+16]
-        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi],  xmm1
-
-
-        movdqa      xmm2,       XMMWORD PTR [rsi+16]
-        movdqa      xmm6,       xmm2
-        punpcklbw   xmm2,       xmm0
-        punpckhbw   xmm6,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+32]
-        paddsw      xmm6,       XMMWORD PTR [rdx+48]
-        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax],  xmm2
-
-
-        movdqa      xmm3,       XMMWORD PTR [rsi+32]
-        movdqa      xmm7,       xmm3
-        punpcklbw   xmm3,       xmm0
-        punpckhbw   xmm7,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+64]
-        paddsw      xmm7,       XMMWORD PTR [rdx+80]
-        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
-
-        add       rdi, rax
-        movdqa      xmm4,       XMMWORD PTR [rsi+48]
-        movdqa      xmm5,       xmm4
-        punpcklbw   xmm4,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+96]
-        paddsw      xmm5,       XMMWORD PTR [rdx+112]
-        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void copy_mem16x16_sse2(
 ;    unsigned char *src,
 ;    int src_stride,
--- a/vp9/common/x86/vp9_recon_wrapper_sse2.c
+++ b/vp9/common/x86/vp9_recon_wrapper_sse2.c
@@ -35,7 +35,7 @@
                                             build_intra_pred_mbuv_fn_t ho_fn) {
   int mode = xd->mode_info_context->mbmi.uv_mode;
   build_intra_pred_mbuv_fn_t fn;
-  int src_stride = xd->dst.uv_stride;
+  int src_stride = xd->plane[1].dst.stride;
 
   switch (mode) {
     case  V_PRED:
@@ -68,34 +68,34 @@
       return;
   }
 
-  fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride);
-  fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride);
+  fn(dst_u, dst_stride, xd->plane[1].dst.buf, src_stride);
+  fn(dst_v, dst_stride, xd->plane[2].dst.buf, src_stride);
 }
 
 void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_sse2,
                                   vp9_intra_pred_uv_ho_mmx2);
 }
 
 void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, &xd->predictor[256],
-                                  &xd->predictor[320], 8,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_ssse3,
                                   vp9_intra_pred_uv_ho_ssse3);
 }
 
 void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_sse2,
                                   vp9_intra_pred_uv_ho_mmx2);
 }
 
 void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
-  build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer,
-                                  xd->dst.v_buffer, xd->dst.uv_stride,
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
                                   vp9_intra_pred_uv_tm_ssse3,
                                   vp9_intra_pred_uv_ho_ssse3);
 }
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -81,10 +81,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -165,10 +165,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -250,10 +250,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -285,10 +285,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, krd
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
--- a/vp9/common/x86/vp9_subpixel_variance_sse2.c
+++ b/vp9/common/x86/vp9_subpixel_variance_sse2.c
@@ -43,48 +43,3 @@
                                      int  yoffset,
                                      int *sum,
                                      unsigned int *sumsquared);
-
-unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char  *src_ptr,
-                                             int  src_pixels_per_line,
-                                             int  xoffset,
-                                             int  yoffset,
-                                             const unsigned char *dst_ptr,
-                                             int dst_pixels_per_line,
-                                             unsigned int *sse) {
-  int xsum0, xsum1;
-  unsigned int xxsum0, xxsum1;
-
-  if (xoffset == HALFNDX && yoffset == 0) {
-    vp9_half_horiz_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      &xsum0, &xxsum0);
-  } else if (xoffset == 0 && yoffset == HALFNDX) {
-    vp9_half_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      &xsum0, &xxsum0);
-  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
-    vp9_half_horiz_vert_variance16x_h_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      &xsum0, &xxsum0);
-  } else {
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr, src_pixels_per_line,
-      dst_ptr, dst_pixels_per_line, 2,
-      xoffset, yoffset,
-      &xsum0, &xxsum0);
-
-    vp9_filter_block2d_bil_var_sse2(
-      src_ptr + 8, src_pixels_per_line,
-      dst_ptr + 8, dst_pixels_per_line, 2,
-      xoffset, yoffset,
-      &xsum1, &xxsum1);
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-  }
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5));
-}
--- a/vp9/decoder/vp9_dboolhuff.c
+++ b/vp9/decoder/vp9_dboolhuff.c
@@ -13,34 +13,32 @@
 
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz) {
-  br->user_buffer_end = source + source_sz;
-  br->user_buffer = source;
-  br->value = 0;
-  br->count = -8;
-  br->range = 255;
+int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) {
+  int marker_bit;
 
-  if (source_sz && !source)
+  r->buffer_end = buffer + size;
+  r->buffer = buffer;
+  r->value = 0;
+  r->count = -8;
+  r->range = 255;
+
+  if (size && !buffer)
     return 1;
 
-  /* Populate the buffer */
-  vp9_bool_decoder_fill(br);
-
-  return 0;
+  vp9_reader_fill(r);
+  marker_bit = vp9_read_bit(r);
+  return marker_bit != 0;
 }
 
-
-void vp9_bool_decoder_fill(BOOL_DECODER *br) {
-  const unsigned char *bufptr = br->user_buffer;
-  const unsigned char *bufend = br->user_buffer_end;
-  VP9_BD_VALUE value = br->value;
-  int count = br->count;
+void vp9_reader_fill(vp9_reader *r) {
+  const uint8_t *const buffer_end = r->buffer_end;
+  const uint8_t *buffer = r->buffer;
+  VP9_BD_VALUE value = r->value;
+  int count = r->count;
   int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8);
   int loop_end = 0;
-  int bits_left = (int)((bufend - bufptr)*CHAR_BIT);
-  int x = shift + CHAR_BIT - bits_left;
+  const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT);
+  const int x = shift + CHAR_BIT - bits_left;
 
   if (x >= 0) {
     count += VP9_LOTS_OF_BITS;
@@ -50,79 +48,22 @@
   if (x < 0 || bits_left) {
     while (shift >= loop_end) {
       count += CHAR_BIT;
-      value |= (VP9_BD_VALUE)*bufptr++ << shift;
+      value |= (VP9_BD_VALUE)*buffer++ << shift;
       shift -= CHAR_BIT;
     }
   }
 
-  br->user_buffer = bufptr;
-  br->value = value;
-  br->count = count;
+  r->buffer = buffer;
+  r->value = value;
+  r->count = count;
 }
 
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if (num_values <= 1)
-    return 0;
-  num_values--;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
+const uint8_t *vp9_reader_find_end(vp9_reader *r) {
+  // Find the end of the coded buffer
+  while (r->count > CHAR_BIT && r->count < VP9_BD_VALUE_SIZE) {
+    r->count -= CHAR_BIT;
+    r->buffer--;
   }
-  return cat;
+  return r->buffer;
 }
 
-int vp9_inv_recenter_nonneg(int v, int m) {
-  if (v > (m << 1))
-    return v;
-  else if ((v & 1) == 0)
-    return (v >> 1) + m;
-  else
-    return m - ((v + 1) >> 1);
-}
-
-int vp9_decode_uniform(BOOL_DECODER *br, int n) {
-  int v;
-  int l = get_unsigned_bits(n);
-  int m = (1 << l) - n;
-  if (!l) return 0;
-  v = decode_value(br, l - 1);
-  if (v < m)
-    return v;
-  else
-    return (v << 1) - m + decode_value(br, 1);
-}
-
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms) {
-  int i = 0, mk = 0, word;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      word = vp9_decode_uniform(br, num_syms - mk) + mk;
-      break;
-    } else {
-      if (decode_value(br, 1)) {
-        i++;
-        mk += a;
-      } else {
-        word = decode_value(br, b) + mk;
-        break;
-      }
-    }
-  }
-  return word;
-}
-
-int vp9_decode_unsigned_max(BOOL_DECODER *br, int max) {
-  int data = 0, bit = 0, lmax = max;
-
-  while (lmax) {
-    data |= decode_bool(br, 128) << bit++;
-    lmax >>= 1;
-  }
-  if (data > max)
-    return max;
-  return data;
-}
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -21,32 +21,29 @@
 typedef size_t VP9_BD_VALUE;
 
 #define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-/*This is meant to be a large, positive constant that can still be efficiently
-   loaded as an immediate (on platforms like ARM, for example).
-  Even relatively modest values like 100 would work fine.*/
-#define VP9_LOTS_OF_BITS (0x40000000)
 
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define VP9_LOTS_OF_BITS 0x40000000
+
 typedef struct {
-  const unsigned char *user_buffer_end;
-  const unsigned char *user_buffer;
-  VP9_BD_VALUE         value;
-  int                  count;
-  unsigned int         range;
-} BOOL_DECODER;
+  const uint8_t *buffer_end;
+  const uint8_t *buffer;
+  VP9_BD_VALUE value;
+  int count;
+  unsigned int range;
+} vp9_reader;
 
 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
-int vp9_start_decode(BOOL_DECODER *br,
-                     const unsigned char *source,
-                     unsigned int source_sz);
+int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size);
 
-void vp9_bool_decoder_fill(BOOL_DECODER *br);
+void vp9_reader_fill(vp9_reader *r);
 
-int vp9_decode_uniform(BOOL_DECODER *br, int n);
-int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);
-int vp9_inv_recenter_nonneg(int v, int m);
+const uint8_t *vp9_reader_find_end(vp9_reader *r);
 
-static int decode_bool(BOOL_DECODER *br, int probability) {
+static int vp9_read(vp9_reader *br, int probability) {
   unsigned int bit = 0;
   VP9_BD_VALUE value;
   VP9_BD_VALUE bigsplit;
@@ -55,7 +52,7 @@
   unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
 
   if (br->count < 0)
-    vp9_bool_decoder_fill(br);
+    vp9_reader_fill(br);
 
   value = br->value;
   count = br->count;
@@ -83,18 +80,20 @@
   return bit;
 }
 
-static int decode_value(BOOL_DECODER *br, int bits) {
-  int z = 0;
-  int bit;
+static int vp9_read_bit(vp9_reader *r) {
+  return vp9_read(r, 128);  // vp9_prob_half
+}
 
-  for (bit = bits - 1; bit >= 0; bit--) {
-    z |= decode_bool(br, 0x80) << bit;
-  }
+static int vp9_read_literal(vp9_reader *br, int bits) {
+  int z = 0, bit;
 
+  for (bit = bits - 1; bit >= 0; bit--)
+    z |= vp9_read_bit(br) << bit;
+
   return z;
 }
 
-static int bool_error(BOOL_DECODER *br) {
+static int vp9_reader_has_error(vp9_reader *r) {
   // Check if we have reached the end of the buffer.
   //
   // Variable 'count' stores the number of bits in the 'value' buffer, minus
@@ -109,9 +108,7 @@
   //
   // 1 if we have tried to decode bits after the end of stream was encountered.
   // 0 No error.
-  return br->count > VP9_BD_VALUE_SIZE && br->count < VP9_LOTS_OF_BITS;
+  return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS;
 }
-
-int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);
 
 #endif  // VP9_DECODER_VP9_DBOOLHUFF_H_
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -20,6 +20,7 @@
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_decodframe.h"
 #include "vp9/common/vp9_mvref_common.h"
 #if CONFIG_DEBUG
 #include <assert.h>
@@ -35,202 +36,138 @@
 extern int dec_debug;
 #endif
 
-static B_PREDICTION_MODE read_bmode(vp9_reader *bc, const vp9_prob *p) {
-  B_PREDICTION_MODE m = treed_read(bc, vp9_bmode_tree, p);
-#if CONFIG_NEWBINTRAMODES
-  if (m == B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS)
-    m = B_CONTEXT_PRED;
-  assert(m < B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS || m == B_CONTEXT_PRED);
-#endif
+static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
+  MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p);
   return m;
 }
 
-static B_PREDICTION_MODE read_kf_bmode(vp9_reader *bc, const vp9_prob *p) {
-  return (B_PREDICTION_MODE)treed_read(bc, vp9_kf_bmode_tree, p);
+static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
+  return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
 }
 
-static MB_PREDICTION_MODE read_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_ymode_tree, p);
-}
+static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
+                           int mi_row, int mi_col, int segment_id) {
+  const int mi_index = mi_row * cm->mi_cols + mi_col;
+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+  const int bw = 1 << mi_width_log2(sb_type);
+  const int bh = 1 << mi_height_log2(sb_type);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  int x, y;
 
-static MB_PREDICTION_MODE read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_sb_ymode_tree, p);
-}
-
-static MB_PREDICTION_MODE read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-static MB_PREDICTION_MODE read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_kf_ymode_tree, p);
-}
-
-static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {
-  return treed_read(bc, vp9_i8x8_mode_tree, p);
-}
-
-static MB_PREDICTION_MODE read_uv_mode(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);
-}
-
-// This function reads the current macro block's segnent id from the bitstream
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *xd) {
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    const vp9_prob *const p = xd->mb_segment_tree_probs;
-    mi->segment_id = vp9_read(r, p[0]) ? 2 + vp9_read(r, p[2])
-                                       : vp9_read(r, p[1]);
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      const int index = mi_index + (y * cm->mi_cols + x);
+      cm->last_frame_seg_map[index] = segment_id;
+    }
   }
 }
 
-// This function reads the current macro block's segnent id from the bitstream
-// It should only be called if a segment map update is indicated.
-static void read_mb_segid_except(VP9_COMMON *cm,
-                                 vp9_reader *r, MB_MODE_INFO *mi,
-                                 MACROBLOCKD *xd, int mb_row, int mb_col) {
-  const int mb_index = mb_row * cm->mb_cols + mb_col;
-  const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, mb_index);
-  const vp9_prob *const p = xd->mb_segment_tree_probs;
-  const vp9_prob prob = xd->mb_segment_mispred_tree_probs[pred_seg_id];
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    mi->segment_id = vp9_read(r, prob)
-        ? 2 + (pred_seg_id  < 2 ? vp9_read(r, p[2]) : (pred_seg_id == 2))
-        :     (pred_seg_id >= 2 ? vp9_read(r, p[1]) : (pred_seg_id == 0));
+static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+                                vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+  TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
+  if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
+    txfm_size += vp9_read(r, tx_probs[1]);
+    if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
+      txfm_size += vp9_read(r, tx_probs[2]);
   }
-}
-
-#if CONFIG_NEW_MVREF
-int vp9_read_mv_ref_id(vp9_reader *r, vp9_prob *ref_id_probs) {
-  int ref_index = 0;
-
-  if (vp9_read(r, ref_id_probs[0])) {
-    ref_index++;
-    if (vp9_read(r, ref_id_probs[1])) {
-      ref_index++;
-      if (vp9_read(r, ref_id_probs[2]))
-        ref_index++;
-    }
+  if (bsize >= BLOCK_SIZE_SB32X32) {
+    cm->fc.tx_count_32x32p[context][txfm_size]++;
+  } else if (bsize >= BLOCK_SIZE_MB16X16) {
+    cm->fc.tx_count_16x16p[context][txfm_size]++;
+  } else {
+    cm->fc.tx_count_8x8p[context][txfm_size]++;
   }
-  return ref_index;
+  return txfm_size;
 }
-#endif
 
-extern const int vp9_i8x8_block[4];
-static void kfread_modes(VP9D_COMP *pbi,
-                         MODE_INFO *m,
-                         int mb_row,
-                         int mb_col,
-                         BOOL_DECODER* const bc) {
+
+static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
+                         int mi_row, int mi_col,
+                         vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const int mis = pbi->common.mode_info_stride;
-  int map_index = mb_row * pbi->common.mb_cols + mb_col;
-  MB_PREDICTION_MODE y_mode;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const int mis = cm->mode_info_stride;
 
-  m->mbmi.ref_frame = INTRA_FRAME;
-
-  // Read the Macroblock segmentation map if it is being updated explicitly
-  // this frame (reset to 0 by default).
+  // Read segmentation map if it is being updated explicitly this frame
   m->mbmi.segment_id = 0;
-  if (pbi->mb.update_mb_segmentation_map) {
-    read_mb_segid(bc, &m->mbmi, &pbi->mb);
-    if (m->mbmi.sb_type) {
-      const int nmbs = 1 << m->mbmi.sb_type;
-      const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
-      const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
-      int x, y;
-
-      for (y = 0; y < ymbs; y++) {
-        for (x = 0; x < xmbs; x++) {
-          cm->last_frame_seg_map[map_index + x + y * cm->mb_cols] =
-              m->mbmi.segment_id;
-        }
-      }
-    } else {
-      cm->last_frame_seg_map[map_index] = m->mbmi.segment_id;
-    }
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
+    m->mbmi.segment_id = read_mb_segid(r, xd);
+    set_segment_id(cm, &m->mbmi, mi_row, mi_col, m->mbmi.segment_id);
   }
 
-  m->mbmi.mb_skip_coeff = 0;
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, SEG_LVL_SKIP))) {
-    m->mbmi.mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, &pbi->mb,
-                                                           PRED_MBSKIP));
-  } else {
-    m->mbmi.mb_skip_coeff = vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id,
-                                                  SEG_LVL_SKIP);
+  m->mbmi.mb_skip_coeff = vp9_segfeature_active(xd, m->mbmi.segment_id,
+                                                SEG_LVL_SKIP);
+  if (!m->mbmi.mb_skip_coeff) {
+    m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+    cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
+                       [m->mbmi.mb_skip_coeff]++;
   }
 
-  y_mode = m->mbmi.sb_type ?
-      read_kf_sb_ymode(bc,
-          pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]):
-      read_kf_mb_ymode(bc,
-          pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-
-  m->mbmi.ref_frame = INTRA_FRAME;
-
-  if ((m->mbmi.mode = y_mode) == B_PRED) {
-    int i = 0;
-    do {
-      const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
-                                  left_block_mode(m, i) : B_DC_PRED;
-
-      m->bmi[i].as_mode.first = read_kf_bmode(bc,
-                                              pbi->common.kf_bmode_prob[a][l]);
-    } while (++i < 16);
-  }
-
-  if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      const int ib = vp9_i8x8_block[i];
-      const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-
-      m->bmi[ib + 0].as_mode.first = mode8x8;
-      m->bmi[ib + 1].as_mode.first = mode8x8;
-      m->bmi[ib + 4].as_mode.first = mode8x8;
-      m->bmi[ib + 5].as_mode.first = mode8x8;
-    }
-  } else {
-    m->mbmi.uv_mode = read_uv_mode(bc,
-                                   pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
-  }
-
   if (cm->txfm_mode == TX_MODE_SELECT &&
-      m->mbmi.mb_skip_coeff == 0 &&
-      m->mbmi.mode <= I8X8_PRED) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
-      m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
-      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.sb_type)
-        m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
-    }
-  } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.sb_type) {
+      m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type);
+  } else if (cm->txfm_mode >= ALLOW_32X32 &&
+             m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
     m->mbmi.txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+             m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) {
     m->mbmi.txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
+  } else if (cm->txfm_mode >= ALLOW_8X8 &&
+             m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
     m->mbmi.txfm_size = TX_8X8;
   } else {
     m->mbmi.txfm_size = TX_4X4;
   }
+
+  // luma mode
+  m->mbmi.ref_frame[0] = INTRA_FRAME;
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+    const MB_PREDICTION_MODE L = xd->left_available ?
+                                  left_block_mode(m, 0) : DC_PRED;
+    m->mbmi.mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
+  } else {
+    int idx, idy;
+    int bw = 1 << b_width_log2(m->mbmi.sb_type);
+    int bh = 1 << b_height_log2(m->mbmi.sb_type);
+
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int ib = idy * 2 + idx;
+        int k;
+        const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                      left_block_mode(m, ib) : DC_PRED;
+        m->bmi[ib].as_mode.first =
+            read_intra_mode(r, cm->kf_y_mode_prob[A][L]);
+        for (k = 1; k < bh; ++k)
+          m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first;
+        for (k = 1; k < bw; ++k)
+          m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first;
+      }
+    }
+    m->mbmi.mode = m->bmi[3].as_mode.first;
+  }
+
+  m->mbmi.uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
 }
 
-static int read_nmv_component(vp9_reader *r,
-                              int rv,
-                              const nmv_component *mvcomp) {
-  int mag, d;
+static int read_mv_component(vp9_reader *r,
+                             const nmv_component *mvcomp, int usehp) {
+
+  int mag, d, fr, hp;
   const int sign = vp9_read(r, mvcomp->sign);
   const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
 
+  // Integer part
   if (mv_class == MV_CLASS_0) {
     d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
   } else {
     int i;
-    int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
 
     d = 0;
     for (i = 0; i < n; ++i)
@@ -237,235 +174,125 @@
       d |= vp9_read(r, mvcomp->bits[i]) << i;
   }
 
-  mag = vp9_get_mv_mag(mv_class, d << 3);
-  return sign ? -(mag + 8) : (mag + 8);
-}
+  // Fractional part
+  fr = treed_read(r, vp9_mv_fp_tree,
+                  mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp);
 
-static int read_nmv_component_fp(vp9_reader *r,
-                                 int v,
-                                 int rv,
-                                 const nmv_component *mvcomp,
-                                 int usehp) {
-  const int sign = v < 0;
-  int mag = ((sign ? -v : v) - 1) & ~7;  // magnitude - 1
-  int offset;
-  const int mv_class = vp9_get_mv_class(mag, &offset);
-  const int f = mv_class == MV_CLASS_0 ?
-      treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[offset >> 3]):
-      treed_read(r, vp9_mv_fp_tree, mvcomp->fp);
 
-  offset += f << 1;
+  // High precision part (if hp is not used, the default value of the hp is 1)
+  hp = usehp ? vp9_read(r,
+                        mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp)
+             : 1;
 
-  if (usehp) {
-    const vp9_prob p = mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp;
-    offset += vp9_read(r, p);
-  } else {
-    offset += 1;  // If hp is not used, the default value of the hp bit is 1
-  }
-  mag = vp9_get_mv_mag(mv_class, offset);
-  return sign ? -(mag + 1) : (mag + 1);
+  // result
+  mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1;
+  return sign ? -mag : mag;
 }
 
-static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,
-                     const nmv_context *mvctx) {
-  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);
-  mv->row = mv-> col = 0;
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);
-  }
-
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);
-  }
-}
-
-static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,
-                        const nmv_context *mvctx, int usehp) {
-  const MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],
-                                    usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],
-                                    usehp);
-  }
-  /*
-  printf("MV: %d %d REF: %d %d\n", mv->row + ref->row, mv->col + ref->col,
-	 ref->row, ref->col);
-	 */
-}
-
-static void update_nmv(vp9_reader *bc, vp9_prob *const p,
+static void update_nmv(vp9_reader *r, vp9_prob *const p,
                        const vp9_prob upd_p) {
-  if (vp9_read(bc, upd_p)) {
+  if (vp9_read(r, upd_p)) {
 #ifdef LOW_PRECISION_MV_UPDATE
-    *p = (vp9_read_literal(bc, 7) << 1) | 1;
+    *p = (vp9_read_literal(r, 7) << 1) | 1;
 #else
-    *p = (vp9_read_literal(bc, 8));
+    *p = (vp9_read_literal(r, 8));
 #endif
   }
 }
 
-static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,
+static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx,
                           int usehp) {
   int i, j, k;
 
 #ifdef MV_GROUP_UPDATE
-  if (!vp9_read_bit(bc))
+  if (!vp9_read_bit(r))
     return;
 #endif
   for (j = 0; j < MV_JOINTS - 1; ++j)
-    update_nmv(bc, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
+    update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);
 
   for (i = 0; i < 2; ++i) {
-    update_nmv(bc, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
+    update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);
     for (j = 0; j < MV_CLASSES - 1; ++j)
-      update_nmv(bc, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < CLASS0_SIZE - 1; ++j)
-      update_nmv(bc, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      update_nmv(bc, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);
   }
 
   for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j) {
+    for (j = 0; j < CLASS0_SIZE; ++j)
       for (k = 0; k < 3; ++k)
-        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
-    }
+        update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);
 
     for (j = 0; j < 3; ++j)
-      update_nmv(bc, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);
   }
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      update_nmv(bc, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
-      update_nmv(bc, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);
+      update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);
     }
   }
 }
 
 // Read the referncence frame
-static MV_REFERENCE_FRAME read_ref_frame(VP9D_COMP *pbi,
-                                         vp9_reader *const bc,
-                                         unsigned char segment_id) {
-  MV_REFERENCE_FRAME ref_frame;
+static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r,
+                           int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
+  const int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                                   SEG_LVL_REF_FRAME);
 
-  int seg_ref_count = 0;
-  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+  // Segment reference frame features not available.
+  if (!seg_ref_active) {
+    int is_comp;
+    int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER);
 
-  // If segment coding enabled does the segment allow for more than one
-  // possible reference frame
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      is_comp = vp9_read(r, cm->fc.comp_inter_prob[comp_ctx]);
+      cm->fc.comp_inter_count[comp_ctx][is_comp]++;
+    } else {
+      is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+    }
 
-  // Segment reference frame features not available or allows for
-  // multiple reference frame options
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    MV_REFERENCE_FRAME pred_ref;
+    // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
+    if (is_comp) {
+      int b, fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+      int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P);
 
-    // Get the context probability the prediction flag
-    vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Read the prediction status flag
-    unsigned char prediction_flag = vp9_read(bc, pred_prob);
-
-    // Store the prediction flag.
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-
-    // Get the predicted reference frame.
-    pred_ref = vp9_get_pred_ref(cm, xd);
-
-    // If correctly predicted then use the predicted value
-    if (prediction_flag) {
-      ref_frame = pred_ref;
+      ref_frame[fix_ref_idx]  = cm->comp_fixed_ref;
+      b = vp9_read(r, cm->fc.comp_ref_prob[ref_ctx]);
+      cm->fc.comp_ref_count[ref_ctx][b]++;
+      ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
     } else {
-      // decode the explicitly coded value
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));
-
-      // If segment coding enabled blank out options that cant occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
+      int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1);
+      ref_frame[1] = NONE;
+      if (vp9_read(r, cm->fc.single_ref_prob[ref1_ctx][0])) {
+        int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2);
+        int b2 = vp9_read(r, cm->fc.single_ref_prob[ref2_ctx][1]);
+        ref_frame[0] = b2 ? ALTREF_FRAME : GOLDEN_FRAME;
+        cm->fc.single_ref_count[ref1_ctx][0][1]++;
+        cm->fc.single_ref_count[ref2_ctx][1][b2]++;
+      } else {
+        ref_frame[0] = LAST_FRAME;
+        cm->fc.single_ref_count[ref1_ctx][0][0]++;
       }
-
-      // Default to INTRA_FRAME (value 0)
-      ref_frame = INTRA_FRAME;
-
-      // Do we need to decode the Intra/Inter branch
-      if (mod_refprobs[0])
-        ref_frame = (MV_REFERENCE_FRAME) vp9_read(bc, mod_refprobs[0]);
-      else
-        ref_frame++;
-
-      if (ref_frame) {
-        // Do we need to decode the Last/Gf_Arf branch
-        if (mod_refprobs[1])
-          ref_frame += vp9_read(bc, mod_refprobs[1]);
-        else
-          ref_frame++;
-
-        if (ref_frame > 1) {
-          // Do we need to decode the GF/Arf branch
-          if (mod_refprobs[2])
-            ref_frame += vp9_read(bc, mod_refprobs[2]);
-          else {
-            if (seg_ref_active) {
-              if ((pred_ref == GOLDEN_FRAME) ||
-                  !vp9_check_segref(xd, segment_id, GOLDEN_FRAME)) {
-                ref_frame = ALTREF_FRAME;
-              } else
-                ref_frame = GOLDEN_FRAME;
-            } else
-              ref_frame = (pred_ref == GOLDEN_FRAME)
-                          ? ALTREF_FRAME : GOLDEN_FRAME;
-          }
-        }
-      }
     }
   } else {
-    // Segment reference frame features are enabled
-    // The reference frame for the mb is considered as correclty predicted
-    // if it is signaled at the segment level for the purposes of the
-    // common prediction model
-    vp9_set_pred_flag(xd, PRED_REF, 1);
-    ref_frame = vp9_get_pred_ref(cm, xd);
+    ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME);
+    ref_frame[1] = NONE;
   }
-
-  return (MV_REFERENCE_FRAME)ref_frame;
 }
 
-static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
+static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) {
+  return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p);
 }
 
-static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
-}
-
-static B_PREDICTION_MODE sub_mv_ref(vp9_reader *bc, const vp9_prob *p) {
-  return (B_PREDICTION_MODE) treed_read(bc, vp9_sub_mv_ref_tree, p);
-}
-
 #ifdef VPX_MODE_COUNT
 unsigned int vp9_mv_cont_count[5][4] = {
   { 0, 0, 0, 0 },
@@ -476,79 +303,103 @@
 };
 #endif
 
-static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 };
-static const unsigned char mbsplit_fill_offset[4][16] = {
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 },
-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15 },
-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15 },
-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 }
-};
-
-static void read_switchable_interp_probs(VP9D_COMP* const pbi,
-                                         BOOL_DECODER* const bc) {
-  VP9_COMMON *const cm = &pbi->common;
+static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) {
   int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      cm->fc.switchable_interp_prob[j][i] = vp9_read_prob(bc);
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+        cm->fc.switchable_interp_prob[j][i] =
+            // vp9_read_prob(r);
+            vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]);
+      }
     }
-  }
-  //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],
-  //cm->fc.switchable_interp_prob[1]);
 }
 
-static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *bc) {
+static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) {
+  int i, j;
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+    for (j = 0; j < VP9_INTER_MODES - 1; ++j) {
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+        // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r);
+        cm->fc.inter_mode_probs[i][j] =
+            vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]);
+      }
+    }
+}
+
+static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
+  COMPPREDMODE_TYPE mode = vp9_read_bit(r);
+  if (mode)
+     mode += vp9_read_bit(r);
+  return mode;
+}
+
+static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  MACROBLOCKD *const xd  = &pbi->mb;
 
-  if (cm->frame_type == KEY_FRAME) {
-    if (!cm->kf_ymode_probs_update)
-      cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
-  } else {
-    if (cm->mcomp_filter_type == SWITCHABLE)
-      read_switchable_interp_probs(pbi, bc);
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cm->use_interintra) {
-      if (vp9_read(bc, VP9_UPD_INTERINTRA_PROB))
-        cm->fc.interintra_prob = vp9_read_prob(bc);
-    }
-#endif
-    // Decode the baseline probabilities for decoding reference frame
-    cm->prob_intra_coded = vp9_read_prob(bc);
-    cm->prob_last_coded  = vp9_read_prob(bc);
-    cm->prob_gf_coded    = vp9_read_prob(bc);
+  if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) {
+    nmv_context *const nmvc = &pbi->common.fc.nmvc;
+    MACROBLOCKD *const xd = &pbi->mb;
+    int i, j;
 
-    // Computes a modified set of probabilities for use when reference
-    // frame prediction fails.
-    vp9_compute_mod_refprobs(cm);
+    read_inter_mode_probs(cm, r);
 
-    pbi->common.comp_pred_mode = vp9_read(bc, 128);
-    if (cm->comp_pred_mode)
-      cm->comp_pred_mode += vp9_read(bc, 128);
-    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
-      int i;
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-        cm->prob_comppred[i] = vp9_read_prob(bc);
+    if (cm->mcomp_filter_type == SWITCHABLE)
+      read_switchable_interp_probs(cm, r);
+
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++) {
+      if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+        cm->fc.intra_inter_prob[i] =
+            vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
     }
 
-    if (vp9_read_bit(bc)) {
-      int i = 0;
-
-      do {
-        cm->fc.ymode_prob[i] = vp9_read_prob(bc);
-      } while (++i < VP9_YMODES - 1);
+    if (cm->allow_comp_inter_inter) {
+      cm->comp_pred_mode = read_comp_pred_mode(r);
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            cm->fc.comp_inter_prob[i] =
+                vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]);
+    } else {
+      cm->comp_pred_mode = SINGLE_PREDICTION_ONLY;
     }
 
-    if (vp9_read_bit(bc)) {
-      int i = 0;
+    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          cm->fc.single_ref_prob[i][0] =
+              vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]);
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          cm->fc.single_ref_prob[i][1] =
+              vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]);
+      }
 
-      do {
-        cm->fc.sb_ymode_prob[i] = vp9_read_prob(bc);
-      } while (++i < VP9_I32X32_MODES - 1);
+    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+      for (i = 0; i < REF_CONTEXTS; i++)
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+          cm->fc.comp_ref_prob[i] =
+              vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]);
+
+    // VP9_INTRA_MODES
+    for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
+      for (i = 0; i < VP9_INTRA_MODES - 1; ++i) {
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+          cm->fc.y_mode_prob[j][i] =
+              vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]);
+        }
+      }
     }
+    for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) {
+      for (i = 0; i < PARTITION_TYPES - 1; ++i) {
+        if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+          cm->fc.partition_prob[INTER_FRAME][j][i] =
+              vp9_read_prob_diff_update(r,
+                  cm->fc.partition_prob[INTER_FRAME][j][i]);
+        }
+      }
+    }
 
-    read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
+    read_nmvprobs(r, nmvc, xd->allow_high_precision_mv);
   }
 }
 
@@ -555,80 +406,40 @@
 // This function either reads the segment id for the current macroblock from
 // the bitstream or if the value is temporally predicted asserts the predicted
 // value
-static void read_mb_segment_id(VP9D_COMP *pbi,
-                               int mb_row, int mb_col,
-                               BOOL_DECODER* const bc) {
+static int read_mb_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col,
+                              vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int mb_index = mb_row * pbi->common.mb_cols + mb_col;
+  MODE_INFO *const mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      // Is temporal coding of the segment id for this mb enabled.
-      if (cm->temporal_update) {
-        // Get the context based probability for reading the
-        // prediction status flag
-        vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+  if (!xd->segmentation_enabled)
+    return 0;  // Default for disabled segmentation
 
-        // Read the prediction status flag
-        unsigned char seg_pred_flag = vp9_read(bc, pred_prob);
+  if (xd->update_mb_segmentation_map) {
+    int segment_id;
 
-        // Store the prediction flag.
-        vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);
+    if (cm->temporal_update) {
+      // Temporal coding of the segment id for this mb is enabled.
+      // Get the context based probability for reading the
+      // prediction status flag
+      const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);
+      const int pred_flag = vp9_read(r, pred_prob);
+      vp9_set_pred_flag(xd, PRED_SEG_ID, pred_flag);
 
-        // If the value is flagged as correctly predicted
-        // then use the predicted value
-        if (seg_pred_flag) {
-          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, mb_index);
-        } else {
-          // Decode it explicitly
-          read_mb_segid_except(cm, bc, mbmi, xd, mb_row, mb_col);
-        }
-      } else {
-        // Normal unpredicted coding mode
-        read_mb_segid(bc, mbmi, xd);
-      }
-
-      if (mbmi->sb_type) {
-        const int nmbs = 1 << mbmi->sb_type;
-        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
-        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
-        int x, y;
-
-        for (y = 0; y < ymbs; y++) {
-          for (x = 0; x < xmbs; x++) {
-            cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols] =
-                mbmi->segment_id;
-          }
-        }
-      } else {
-        cm->last_frame_seg_map[mb_index] = mbmi->segment_id;
-      }
+      // If the value is flagged as correctly predicted
+      // then use the predicted value, otherwise decode it explicitly
+      segment_id = pred_flag ? vp9_get_pred_mi_segid(cm, mbmi->sb_type,
+                                                     mi_row, mi_col)
+                             : read_mb_segid(r, xd);
     } else {
-      if (mbmi->sb_type) {
-        const int nmbs = 1 << mbmi->sb_type;
-        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
-        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
-        unsigned segment_id = -1;
-        int x, y;
-
-        for (y = 0; y < ymbs; y++) {
-          for (x = 0; x < xmbs; x++) {
-            segment_id = MIN(segment_id,
-                cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols]);
-          }
-        }
-        mbmi->segment_id = segment_id;
-      } else {
-        mbmi->segment_id = cm->last_frame_seg_map[mb_index];
-      }
+      segment_id = read_mb_segid(r, xd);  // Normal unpredicted coding mode
     }
+
+    set_segment_id(cm, mbmi, mi_row, mi_col, segment_id);  // Side effect
+    return segment_id;
   } else {
-    // The encoder explicitly sets the segment_id to 0
-    // when segmentation is disabled
-    mbmi->segment_id = 0;
+    return vp9_get_pred_mi_segid(cm, mbmi->sb_type, mi_row, mi_col);
   }
 }
 
@@ -643,48 +454,66 @@
            mb_to_bottom_edge);
 }
 
-static INLINE void process_mv(BOOL_DECODER* bc, MV *mv, MV *ref,
-                              nmv_context *nmvc, nmv_context_counts *mvctx,
-                              int usehp) {
-  read_nmv(bc, mv, ref, nmvc);
-  read_nmv_fp(bc, mv, ref, nmvc, usehp);
-  vp9_increment_nmv(mv, ref, mvctx, usehp);
-  mv->row += ref->row;
-  mv->col += ref->col;
+static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
+                             const nmv_context *ctx,
+                             nmv_context_counts *counts,
+                             int usehp) {
+  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+  MV diff = {0, 0};
+
+  usehp = usehp && vp9_use_nmv_hp(ref);
+  if (mv_joint_vertical(j))
+    diff.row = read_mv_component(r, &ctx->comps[0], usehp);
+
+  if (mv_joint_horizontal(j))
+    diff.col = read_mv_component(r, &ctx->comps[1], usehp);
+
+  vp9_increment_nmv(&diff, ref, counts, usehp);
+
+  mv->row = diff.row + ref->row;
+  mv->col = diff.col + ref->col;
 }
 
+static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
+    VP9D_COMP *pbi, vp9_reader *r) {
+  const int index = treed_read(r, vp9_switchable_interp_tree,
+                               vp9_get_pred_probs(&pbi->common, &pbi->mb,
+                                                  PRED_SWITCHABLE_INTERP));
+  ++pbi->common.fc.switchable_interp_count
+                [vp9_get_pred_context(
+                    &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index];
+  return vp9_switchable_interp[index];
+}
+
 static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
-                             MODE_INFO *prev_mi,
-                             int mb_row, int mb_col,
-                             BOOL_DECODER* const bc) {
+                             int mi_row, int mi_col,
+                             vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
-  nmv_context *const nmvc = &pbi->common.fc.nmvc;
-  const int mis = pbi->common.mode_info_stride;
+  nmv_context *const nmvc = &cm->fc.nmvc;
   MACROBLOCKD *const xd = &pbi->mb;
 
-  int_mv *const mv = &mbmi->mv[0];
-  const int mb_size = 1 << mi->mbmi.sb_type;
+  int_mv *const mv0 = &mbmi->mv[0];
+  int_mv *const mv1 = &mbmi->mv[1];
+  BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
 
-  const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
-                                       cm->height == cm->last_height &&
-                                       !cm->error_resilient_mode;
-
   int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
+  int j, idx, idy;
 
   mbmi->need_to_clamp_mvs = 0;
   mbmi->need_to_clamp_secondmv = 0;
-  mbmi->second_ref_frame = NONE;
+  mbmi->ref_frame[1] = NONE;
 
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
   // correct entry for the current macroblock.
   xd->mode_info_context = mi;
-  xd->prev_mode_info_context = prev_mi;
 
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV values
   // that are in 1/8th pel units
-  set_mb_row(cm, xd, mb_row, mb_size);
-  set_mb_col(cm, xd, mb_col, mb_size);
+  set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize),
+                         mi_col, 1 << mi_width_log2(bsize));
 
   mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
   mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
@@ -692,81 +521,78 @@
   mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
 
   // Read the macroblock segment id.
-  read_mb_segment_id(pbi, mb_row, mb_col, bc);
+  mbmi->segment_id = read_mb_segment_id(pbi, mi_row, mi_col, r);
 
-  if (pbi->common.mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP))) {
-    // Read the macroblock coeff skip flag if this feature is in use,
-    // else default to 0
-    mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
-  } else {
-    mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,
-                                                SEG_LVL_SKIP);
+  mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,
+                                              SEG_LVL_SKIP);
+  if (!mbmi->mb_skip_coeff) {
+    mbmi->mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
+    cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)]
+                       [mbmi->mb_skip_coeff]++;
   }
 
   // Read the reference frame
-  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);
+  if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) {
+    mbmi->ref_frame[0] =
+        vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
+    cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
+                            [mbmi->ref_frame[0] != INTRA_FRAME]++;
+  } else {
+    mbmi->ref_frame[0] =
+        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+  }
 
-  /*
-  if (pbi->common.current_video_frame == 1)
-    printf("ref frame: %d [%d %d]\n", mbmi->ref_frame, mb_row, mb_col);
-    */
+  if (cm->txfm_mode == TX_MODE_SELECT &&
+      (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) &&
+      bsize >= BLOCK_SIZE_SB8X8) {
+    mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize);
+  } else if (bsize >= BLOCK_SIZE_SB32X32 &&
+             cm->txfm_mode >= ALLOW_32X32) {
+    mbmi->txfm_size = TX_32X32;
+  } else if (cm->txfm_mode >= ALLOW_16X16 &&
+             bsize >= BLOCK_SIZE_MB16X16) {
+    mbmi->txfm_size = TX_16X16;
+  } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) {
+    mbmi->txfm_size = TX_8X8;
+  } else {
+    mbmi->txfm_size = TX_4X4;
+  }
 
   // If reference frame is an Inter frame
-  if (mbmi->ref_frame) {
+  if (mbmi->ref_frame[0] != INTRA_FRAME) {
     int_mv nearest, nearby, best_mv;
     int_mv nearest_second, nearby_second, best_mv_second;
-    vp9_prob mv_ref_p[VP9_MVREFS - 1];
+    vp9_prob mv_ref_p[VP9_INTER_MODES - 1];
 
-    MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
+    read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame);
 
     {
-      const int use_prev_in_find_best_ref =
-          xd->scale_factor[0].x_num == xd->scale_factor[0].x_den &&
-          xd->scale_factor[0].y_num == xd->scale_factor[0].y_den &&
-          !cm->error_resilient_mode &&
-          !cm->frame_parallel_decoding_mode;
-
-      /* Select the appropriate reference frame for this MB */
-      const int ref_fb_idx = cm->active_ref_idx[ref_frame - 1];
-
-      setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx],
-          mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
 #ifdef DEC_DEBUG
       if (dec_debug)
         printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
                xd->mode_info_context->mbmi.mv[0].as_mv.col);
 #endif
-      // if (cm->current_video_frame == 1 && mb_row == 4 && mb_col == 5)
-      //  printf("Dello\n");
-      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
-                       ref_frame, mbmi->ref_mvs[ref_frame],
+      vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
+                       mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]],
                        cm->ref_frame_sign_bias);
 
-      vp9_mv_ref_probs(&pbi->common, mv_ref_p,
-                       mbmi->mb_mode_context[ref_frame]);
+      vp9_mv_ref_probs(cm, mv_ref_p, mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
       // If the segment level skip mode enabled
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
-      } else {
-        mbmi->mode = mbmi->sb_type ? read_sb_mv_ref(bc, mv_ref_p)
-                                   : read_mv_ref(bc, mv_ref_p);
-        vp9_accum_mv_refs(&pbi->common, mbmi->mode,
-                          mbmi->mb_mode_context[ref_frame]);
+      } else if (bsize >= BLOCK_SIZE_SB8X8) {
+        mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
+        vp9_accum_mv_refs(cm, mbmi->mode,
+                          mbmi->mb_mode_context[mbmi->ref_frame[0]]);
       }
 
-      if (mbmi->mode != ZEROMV) {
+      if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
         vp9_find_best_ref_mvs(xd,
-                              use_prev_in_find_best_ref ?
-                                  xd->pre.y_buffer : NULL,
-                              xd->pre.y_stride,
-                              mbmi->ref_mvs[ref_frame],
+                              mbmi->ref_mvs[mbmi->ref_frame[0]],
                               &nearest, &nearby);
 
-        best_mv.as_int = (mbmi->ref_mvs[ref_frame][0]).as_int;
+        best_mv.as_int = mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_int;
       }
 
 #ifdef DEC_DEBUG
@@ -777,176 +603,79 @@
 #endif
     }
 
-    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {
-      if (cm->mcomp_filter_type == SWITCHABLE) {
-        mbmi->interp_filter = vp9_switchable_interp[
-            treed_read(bc, vp9_switchable_interp_tree,
-                       vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP))];
-      } else {
-        mbmi->interp_filter = cm->mcomp_filter_type;
-      }
-    }
+    mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
+                              ? read_switchable_filter_type(pbi, r)
+                              : cm->mcomp_filter_type;
 
-    if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
-        (cm->comp_pred_mode == HYBRID_PREDICTION &&
-         vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_COMP)))) {
-      /* Since we have 3 reference frames, we can only have 3 unique
-       * combinations of combinations of 2 different reference frames
-       * (A-G, G-L or A-L). In the bitstream, we use this to simply
-       * derive the second reference frame from the first reference
-       * frame, by saying it's the next one in the enumerator, and
-       * if that's > n_refs, then the second reference frame is the
-       * first one in the enumerator. */
-      mbmi->second_ref_frame = mbmi->ref_frame + 1;
-      if (mbmi->second_ref_frame == 4)
-        mbmi->second_ref_frame = 1;
-      if (mbmi->second_ref_frame > 0) {
-        int second_ref_fb_idx;
-        int use_prev_in_find_best_ref;
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
+      vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
+                       mbmi->ref_frame[1],
+                       mbmi->ref_mvs[mbmi->ref_frame[1]],
+                       cm->ref_frame_sign_bias);
 
-        xd->scale_factor[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];
-        use_prev_in_find_best_ref =
-            xd->scale_factor[1].x_num == xd->scale_factor[1].x_den &&
-            xd->scale_factor[1].y_num == xd->scale_factor[1].y_den &&
-            !cm->error_resilient_mode &&
-            !cm->frame_parallel_decoding_mode;
-
-        /* Select the appropriate reference frame for this MB */
-        second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
-
-        setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
-             mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-
-        vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
-                         mbmi->second_ref_frame,
-                         mbmi->ref_mvs[mbmi->second_ref_frame],
-                         cm->ref_frame_sign_bias);
-
-        if (mbmi->mode != ZEROMV) {
-          vp9_find_best_ref_mvs(xd,
-                                use_prev_in_find_best_ref ?
-                                    xd->second_pre.y_buffer : NULL,
-                                xd->second_pre.y_stride,
-                                mbmi->ref_mvs[mbmi->second_ref_frame],
-                                &nearest_second,
-                                &nearby_second);
-          best_mv_second = mbmi->ref_mvs[mbmi->second_ref_frame][0];
-        }
+      if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) {
+        vp9_find_best_ref_mvs(xd,
+                              mbmi->ref_mvs[mbmi->ref_frame[1]],
+                              &nearest_second,
+                              &nearby_second);
+        best_mv_second.as_int = mbmi->ref_mvs[mbmi->ref_frame[1]][0].as_int;
       }
-
-    } else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (pbi->common.use_interintra &&
-          mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
-          mbmi->second_ref_frame == NONE) {
-        mbmi->second_ref_frame = (vp9_read(bc, pbi->common.fc.interintra_prob) ?
-                                  INTRA_FRAME : NONE);
-        // printf("-- %d (%d)\n", mbmi->second_ref_frame == INTRA_FRAME,
-        //        pbi->common.fc.interintra_prob);
-        pbi->common.fc.interintra_counts[
-            mbmi->second_ref_frame == INTRA_FRAME]++;
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          mbmi->interintra_mode = read_ymode(bc, pbi->common.fc.ymode_prob);
-          pbi->common.fc.ymode_counts[mbmi->interintra_mode]++;
-#if SEPARATE_INTERINTRA_UV
-          mbmi->interintra_uv_mode = read_uv_mode(bc,
-              pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);
-          pbi->common.fc.uv_mode_counts[mbmi->interintra_mode]
-                                       [mbmi->interintra_uv_mode]++;
-#else
-          mbmi->interintra_uv_mode = mbmi->interintra_mode;
-#endif
-          // printf("** %d %d\n",
-          //        mbmi->interintra_mode, mbmi->interintra_uv_mode);
-        }
-      }
-#endif
     }
 
-#if CONFIG_NEW_MVREF
-    // if ((mbmi->mode == NEWMV) || (mbmi->mode == SPLITMV))
-    if (mbmi->mode == NEWMV) {
-      int best_index;
-      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-      // Encode the index of the choice.
-      best_index =
-        vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
-
-      best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-
-      if (mbmi->second_ref_frame > 0) {
-        ref_frame = mbmi->second_ref_frame;
-
-        // Encode the index of the choice.
-        best_index =
-          vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
-        best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-      }
-    }
-#endif
-
     mbmi->uv_mode = DC_PRED;
-    switch (mbmi->mode) {
-      case SPLITMV: {
-        const int s = treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
-        const int num_p = vp9_mbsplit_count[s];
-        int j = 0;
-
-        cm->fc.mbsplit_counts[s]++;
-        mbmi->need_to_clamp_mvs = 0;
-        mbmi->partitioning = s;
-        do {  // for each subset j
-          int_mv leftmv, abovemv, second_leftmv, second_abovemv;
+    if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+      mbmi->need_to_clamp_mvs = 0;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
           int_mv blockmv, secondmv;
-          int mv_contz;
           int blockmode;
-          int k = vp9_mbsplit_offset[s][j];  // first block in subset j
+          int i;
+          j = idy * 2 + idx;
 
-          leftmv.as_int = left_block_mv(xd, mi, k);
-          abovemv.as_int = above_block_mv(mi, k, mis);
-          second_leftmv.as_int = 0;
-          second_abovemv.as_int = 0;
-          if (mbmi->second_ref_frame > 0) {
-            second_leftmv.as_int = left_block_second_mv(xd, mi, k);
-            second_abovemv.as_int = above_block_second_mv(mi, k, mis);
+          blockmode = read_sb_mv_ref(r, mv_ref_p);
+          vp9_accum_mv_refs(cm, blockmode,
+                            mbmi->mb_mode_context[mbmi->ref_frame[0]]);
+          if (blockmode == NEARESTMV || blockmode == NEARMV) {
+            MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
+            vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0);
+            if (rf2 > 0) {
+              vp9_append_sub8x8_mvs_for_idx(cm, xd,  &nearest_second,
+                                            &nearby_second, j, 1);
+            }
           }
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-          blockmode = sub_mv_ref(bc, cm->fc.sub_mv_ref_prob [mv_contz]);
-          cm->fc.sub_mv_ref_counts[mv_contz][blockmode - LEFT4X4]++;
 
           switch (blockmode) {
-            case NEW4X4:
-              process_mv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,
+            case NEWMV:
+              decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc,
                          &cm->fc.NMVcount, xd->allow_high_precision_mv);
 
-              if (mbmi->second_ref_frame > 0)
-                process_mv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
-                           &cm->fc.NMVcount, xd->allow_high_precision_mv);
+              if (mbmi->ref_frame[1] > 0)
+                decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,
+                          &cm->fc.NMVcount, xd->allow_high_precision_mv);
 
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][3]++;
 #endif
               break;
-            case LEFT4X4:
-              blockmv.as_int = leftmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = second_leftmv.as_int;
+            case NEARESTMV:
+              blockmv.as_int = nearest.as_int;
+              if (mbmi->ref_frame[1] > 0)
+                secondmv.as_int = nearest_second.as_int;
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][0]++;
 #endif
               break;
-            case ABOVE4X4:
-              blockmv.as_int = abovemv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                secondmv.as_int = second_abovemv.as_int;
+            case NEARMV:
+              blockmv.as_int = nearby.as_int;
+              if (mbmi->ref_frame[1] > 0)
+                secondmv.as_int = nearby_second.as_int;
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][1]++;
 #endif
               break;
-            case ZERO4X4:
+            case ZEROMV:
               blockmv.as_int = 0;
-              if (mbmi->second_ref_frame > 0)
+              if (mbmi->ref_frame[1] > 0)
                 secondmv.as_int = 0;
 #ifdef VPX_MODE_COUNT
               vp9_mv_cont_count[mv_contz][2]++;
@@ -955,490 +684,154 @@
             default:
               break;
           }
+          mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+          if (mbmi->ref_frame[1] > 0)
+            mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
 
-          /*  Commenting this section out, not sure why this was needed, and
-           *  there are mismatches with this section in rare cases since it is
-           *  not done in the encoder at all.
-          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
-                                                     mb_to_left_edge,
-                                                     mb_to_right_edge,
-                                                     mb_to_top_edge,
-                                                     mb_to_bottom_edge);
-          if (mbmi->second_ref_frame > 0) {
-            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
-                                                       mb_to_left_edge,
-                                                       mb_to_right_edge,
-                                                       mb_to_top_edge,
-                                                       mb_to_bottom_edge);
-          }
-          */
-
-          {
-            /* Fill (uniform) modes, mvs of jth subset.
-             Must do it here because ensuing subsets can
-             refer back to us via "left" or "above". */
-            unsigned int fill_count = mbsplit_fill_count[s];
-            const unsigned char *fill_offset =
-                &mbsplit_fill_offset[s][j * fill_count];
-
-            do {
-              mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int;
-              fill_offset++;
-            } while (--fill_count);
-          }
-
-        } while (++j < num_p);
+          for (i = 1; i < bh; ++i)
+            vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j]));
+          for (i = 1; i < bw; ++i)
+            vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j]));
+          mi->mbmi.mode = blockmode;
+        }
       }
 
-      mv->as_int = mi->bmi[15].as_mv[0].as_int;
-      mbmi->mv[1].as_int = mi->bmi[15].as_mv[1].as_int;
-
-      break;  /* done with SPLITMV */
-
-      case NEARMV:
-        // Clip "next_nearest" so that it does not extend to far out of image
-        assign_and_clamp_mv(mv, &nearby, mb_to_left_edge,
-                                         mb_to_right_edge,
-                                         mb_to_top_edge,
-                                         mb_to_bottom_edge);
-        if (mbmi->second_ref_frame > 0)
-          assign_and_clamp_mv(&mbmi->mv[1], &nearby_second, mb_to_left_edge,
-                                                            mb_to_right_edge,
-                                                            mb_to_top_edge,
-                                                            mb_to_bottom_edge);
-        break;
+      mv0->as_int = mi->bmi[3].as_mv[0].as_int;
+      mv1->as_int = mi->bmi[3].as_mv[1].as_int;
+    } else {
+      switch (mbmi->mode) {
+        case NEARMV:
+          // Clip "next_nearest" so that it does not extend too far out of image
+          assign_and_clamp_mv(mv0, &nearby, mb_to_left_edge,
+                                            mb_to_right_edge,
+                                            mb_to_top_edge,
+                                            mb_to_bottom_edge);
+          if (mbmi->ref_frame[1] > 0)
+            assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge,
+                                                     mb_to_right_edge,
+                                                     mb_to_top_edge,
+                                                     mb_to_bottom_edge);
+          break;
 
-      case NEARESTMV:
-        // Clip "next_nearest" so that it does not extend to far out of image
-        assign_and_clamp_mv(mv, &nearest, mb_to_left_edge,
-                                          mb_to_right_edge,
-                                          mb_to_top_edge,
-                                          mb_to_bottom_edge);
-        if (mbmi->second_ref_frame > 0)
-          assign_and_clamp_mv(&mbmi->mv[1], &nearest_second, mb_to_left_edge,
-                                                             mb_to_right_edge,
-                                                             mb_to_top_edge,
-                                                             mb_to_bottom_edge);
-        break;
+        case NEARESTMV:
+          // Clip "next_nearest" so that it does not extend too far out of image
+          assign_and_clamp_mv(mv0, &nearest, mb_to_left_edge,
+                                             mb_to_right_edge,
+                                             mb_to_top_edge,
+                                             mb_to_bottom_edge);
+          if (mbmi->ref_frame[1] > 0)
+            assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge,
+                                                      mb_to_right_edge,
+                                                      mb_to_top_edge,
+                                                      mb_to_bottom_edge);
+          break;
 
-      case ZEROMV:
-        mv->as_int = 0;
-        if (mbmi->second_ref_frame > 0)
-          mbmi->mv[1].as_int = 0;
-        break;
+        case ZEROMV:
+          mv0->as_int = 0;
+          if (mbmi->ref_frame[1] > 0)
+            mv1->as_int = 0;
+          break;
 
-      case NEWMV:
-        process_mv(bc, &mv->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
-                   xd->allow_high_precision_mv);
+        case NEWMV:
+          decode_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,
+                    xd->allow_high_precision_mv);
+          mbmi->need_to_clamp_mvs = check_mv_bounds(mv0,
+                                                    mb_to_left_edge,
+                                                    mb_to_right_edge,
+                                                    mb_to_top_edge,
+                                                    mb_to_bottom_edge);
 
-        // Don't need to check this on NEARMV and NEARESTMV modes
-        // since those modes clamp the MV. The NEWMV mode does not,
-        // so signal to the prediction stage whether special
-        // handling may be required.
-        mbmi->need_to_clamp_mvs = check_mv_bounds(mv,
-                                                  mb_to_left_edge,
-                                                  mb_to_right_edge,
-                                                  mb_to_top_edge,
-                                                  mb_to_bottom_edge);
-
-        if (mbmi->second_ref_frame > 0) {
-          process_mv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
-                     &cm->fc.NMVcount, xd->allow_high_precision_mv);
-          mbmi->need_to_clamp_secondmv |= check_mv_bounds(&mbmi->mv[1],
-                                                          mb_to_left_edge,
-                                                          mb_to_right_edge,
-                                                          mb_to_top_edge,
-                                                          mb_to_bottom_edge);
-        }
-        break;
-      default:
-;
+          if (mbmi->ref_frame[1] > 0) {
+            decode_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc,
+                      &cm->fc.NMVcount, xd->allow_high_precision_mv);
+            mbmi->need_to_clamp_secondmv = check_mv_bounds(mv1,
+                                                             mb_to_left_edge,
+                                                             mb_to_right_edge,
+                                                             mb_to_top_edge,
+                                                             mb_to_bottom_edge);
+          }
+          break;
+        default:
 #if CONFIG_DEBUG
-        assert(0);
+          assert(0);
 #endif
+          break;
+      }
     }
   } else {
-    /* required for left and above block mv */
-    mbmi->mv[0].as_int = 0;
+    // required for left and above block mv
+    mv0->as_int = 0;
 
-    if (mbmi->sb_type) {
-      mbmi->mode = read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
-      pbi->common.fc.sb_ymode_counts[mbmi->mode]++;
+    if (bsize >= BLOCK_SIZE_SB8X8) {
+      const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+      const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+      const int bsl = MIN(bwl, bhl);
+      mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
+      cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
     } else {
-      mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob);
-      pbi->common.fc.ymode_counts[mbmi->mode]++;
-    }
-
-    // If MB mode is BPRED read the block modes
-    if (mbmi->mode == B_PRED) {
-      int j = 0;
-      do {
-        int m = read_bmode(bc, pbi->common.fc.bmode_prob);
-        mi->bmi[j].as_mode.first = m;
-#if CONFIG_NEWBINTRAMODES
-        if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
-        pbi->common.fc.bmode_counts[m]++;
-      } while (++j < 16);
-    }
-
-    if (mbmi->mode == I8X8_PRED) {
-      int i;
-      for (i = 0; i < 4; i++) {
-        const int ib = vp9_i8x8_block[i];
-        const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);
-
-        mi->bmi[ib + 0].as_mode.first = mode8x8;
-        mi->bmi[ib + 1].as_mode.first = mode8x8;
-        mi->bmi[ib + 4].as_mode.first = mode8x8;
-        mi->bmi[ib + 5].as_mode.first = mode8x8;
-        pbi->common.fc.i8x8_mode_counts[mode8x8]++;
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
+          int ib = idy * 2 + idx, k;
+          int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+          mi->bmi[ib].as_mode.first = m;
+          cm->fc.y_mode_counts[0][m]++;
+          for (k = 1; k < bh; ++k)
+            mi->bmi[ib + k * 2].as_mode.first = m;
+          for (k = 1; k < bw; ++k)
+            mi->bmi[ib + k].as_mode.first = m;
+        }
       }
-    } else {
-      mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
-      pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+      mbmi->mode = mi->bmi[3].as_mode.first;
     }
-  }
-  /*
-  if (pbi->common.current_video_frame == 1)
-    printf("mode: %d skip: %d\n", mbmi->mode, mbmi->mb_skip_coeff);
-    */
 
-  if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && !(mbmi->mode == SPLITMV &&
-                           mbmi->partitioning == PARTITIONING_4X4)))) {
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV) {
-      mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
-      if (mbmi->sb_type && mbmi->txfm_size != TX_8X8)
-        mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
-    }
-  } else if (mbmi->sb_type && cm->txfm_mode >= ALLOW_32X32) {
-    mbmi->txfm_size = TX_32X32;
-  } else if (cm->txfm_mode >= ALLOW_16X16 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
-    mbmi->txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 &&
-      (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == B_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV &&
-         mbmi->partitioning == PARTITIONING_4X4))) {
-    mbmi->txfm_size = TX_8X8;
-  } else {
-    mbmi->txfm_size = TX_4X4;
+    mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+    cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
   }
 }
 
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc) {
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
   VP9_COMMON *cm = &pbi->common;
+  int k;
 
-  vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));
-  if (pbi->common.mb_no_coeff_skip) {
-    int k;
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-      cm->mbskip_pred_probs[k] = vp9_read_prob(bc);
+  // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
+  // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+    if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+      cm->fc.mbskip_probs[k] =
+          vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
     }
+    // cm->fc.mbskip_probs[k] = vp9_read_prob(r);
   }
 
-  mb_mode_mv_init(pbi, bc);
+  mb_mode_mv_init(pbi, r);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static uint16_t read_nzc(VP9_COMMON *const cm,
-                         int nzc_context,
-                         TX_SIZE tx_size,
-                         int ref,
-                         int type,
-                         BOOL_DECODER* const bc) {
-  int c, e;
-  uint16_t nzc;
-  if (tx_size == TX_32X32) {
-    c = treed_read(bc, vp9_nzc32x32_tree,
-                   cm->fc.nzc_probs_32x32[nzc_context][ref][type]);
-    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_16X16) {
-    c = treed_read(bc, vp9_nzc16x16_tree,
-                   cm->fc.nzc_probs_16x16[nzc_context][ref][type]);
-    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_8X8) {
-    c = treed_read(bc, vp9_nzc8x8_tree,
-                   cm->fc.nzc_probs_8x8[nzc_context][ref][type]);
-    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_4X4) {
-    c = treed_read(bc, vp9_nzc4x4_tree,
-                   cm->fc.nzc_probs_4x4[nzc_context][ref][type]);
-    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
-  } else {
-    assert(0);
-  }
-  nzc = vp9_basenzcvalue[c];
-  if ((e = vp9_extranzcbits[c])) {
-    int x = 0;
-    while (e--) {
-      int b = vp9_read(
-          bc, cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
-      x |= (b << e);
-      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-    nzc += x;
-  }
-  if (tx_size == TX_32X32)
-    assert(nzc <= 1024);
-  else if (tx_size == TX_16X16)
-    assert(nzc <= 256);
-  else if (tx_size == TX_8X8)
-    assert(nzc <= 64);
-  else if (tx_size == TX_4X4)
-    assert(nzc <= 16);
-  return nzc;
-}
-
-static void read_nzcs_sb64(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, bc);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void read_nzcs_sb32(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void read_nzcs_mb16(VP9_COMMON *const cm,
-                           MACROBLOCKD* xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
                            MACROBLOCKD* const xd,
-                           int mb_row,
-                           int mb_col,
-                           BOOL_DECODER* const bc) {
+                           int mi_row,
+                           int mi_col,
+                           vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *mi = xd->mode_info_context;
-  MODE_INFO *prev_mi = xd->prev_mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-  if (pbi->common.frame_type == KEY_FRAME) {
-    kfread_modes(pbi, mi, mb_row, mb_col, bc);
+  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+    kfread_modes(pbi, mi, mi_row, mi_col, r);
   } else {
-    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);
-    set_scale_factors(xd,
-                      mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
-                      pbi->common.active_ref_scale);
+    read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  if (mbmi->sb_type == BLOCK_SIZE_SB64X64)
-    read_nzcs_sb64(cm, xd, mb_row, mb_col, bc);
-  else if (mbmi->sb_type == BLOCK_SIZE_SB32X32)
-    read_nzcs_sb32(cm, xd, mb_row, mb_col, bc);
-  else
-    read_nzcs_mb16(cm, xd, mb_row, mb_col, bc);
-#endif  // CONFIG_CODE_NONZEROCOUNT
 
-  if (mbmi->sb_type) {
-    const int n_mbs = 1 << mbmi->sb_type;
-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+  if (1) {
+    const int bw = 1 << mi_width_log2(mbmi->sb_type);
+    const int bh = 1 << mi_height_log2(mbmi->sb_type);
+    const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+    const int x_mis = MIN(bw, cm->mi_cols - mi_col);
     const int mis = cm->mode_info_stride;
     int x, y;
 
-    for (y = 0; y < y_mbs; y++) {
-      for (x = !y; x < x_mbs; x++) {
+    for (y = 0; y < y_mis; y++)
+      for (x = !y; x < x_mis; x++)
         mi[y * mis + x] = *mi;
-      }
-    }
-  } else {
-    update_blockd_bmi(xd);
   }
 }
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -17,7 +17,7 @@
                            MACROBLOCKD* const xd,
                            int mb_row,
                            int mb_col,
-                           BOOL_DECODER* const bc);
-void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
+                           vp9_reader *r);
+void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r);
 
 #endif  // VP9_DECODER_VP9_DECODEMV_H_
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -8,36 +8,32 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 
-#include "vp9/decoder/vp9_onyxd_int.h"
+#include "./vp9_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+
+#include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_modecont.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_header.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/common/vp9_invtrans.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_setupintrarecon.h"
-
-#include "vp9/decoder/vp9_decodemv.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_modecont.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_tile_common.h"
-#include "vp9_rtcd.h"
 
-#include <assert.h>
-#include <stdio.h>
+#include "vp9/decoder/vp9_dboolhuff.h"
+#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_read_bit_buffer.h"
 
-#define COEFCOUNT_TESTING
 
 // #define DEC_DEBUG
 #ifdef DEC_DEBUG
@@ -44,24 +40,111 @@
 int dec_debug = 0;
 #endif
 
-static int read_le16(const uint8_t *p) {
-  return (p[1] << 8) | p[0];
+static int read_be32(const uint8_t *p) {
+  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
 }
 
-static int read_le32(const uint8_t *p) {
-  return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
-}
-
 // len == 0 is not allowed
-static int read_is_valid(const unsigned char *start, size_t len,
-                         const unsigned char *end) {
+static int read_is_valid(const uint8_t *start, size_t len,
+                         const uint8_t *end) {
   return start + len > start && start + len <= end;
 }
 
+static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) {
+  if (lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
+    pc->txfm_mode = vp9_read_literal(r, 2);
+    if (pc->txfm_mode == ALLOW_32X32)
+      pc->txfm_mode += vp9_read_bit(r);
+    if (pc->txfm_mode == TX_MODE_SELECT) {
+      int i, j;
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) {
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            pc->fc.tx_probs_8x8p[i][j] =
+                vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]);
+        }
+      }
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) {
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            pc->fc.tx_probs_16x16p[i][j] =
+                vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]);
+        }
+      }
+      for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+        for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) {
+          if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+            pc->fc.tx_probs_32x32p[i][j] =
+                vp9_read_prob_diff_update(r, pc->fc.tx_probs_32x32p[i][j]);
+        }
+      }
+    }
+  }
+}
+
+static int get_unsigned_bits(unsigned int num_values) {
+  int cat = 0;
+  if (num_values <= 1)
+    return 0;
+  num_values--;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
+static int inv_recenter_nonneg(int v, int m) {
+  if (v > 2 * m)
+    return v;
+
+  return v % 2 ? m - (v + 1) / 2 : m + v / 2;
+}
+
+static int decode_uniform(vp9_reader *r, int n) {
+  int v;
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (!l)
+    return 0;
+
+  v = vp9_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vp9_read_bit(r);
+}
+
+static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
+  int i = 0, mk = 0, word;
+  while (1) {
+    const int b = i ? k + i - 1 : k;
+    const int a = 1 << b;
+    if (num_syms <= mk + 3 * a) {
+      word = decode_uniform(r, num_syms - mk) + mk;
+      break;
+    } else {
+      if (vp9_read_bit(r)) {
+        i++;
+        mk += a;
+      } else {
+        word = vp9_read_literal(r, b) + mk;
+        break;
+      }
+    }
+  }
+  return word;
+}
+
+static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
+  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+  return data > max ? max : data;
+}
+
 static int merge_index(int v, int n, int modulus) {
   int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (v < max1) v = v * modulus + modulus / 2;
-  else {
+  if (v < max1) {
+    v = v * modulus + modulus / 2;
+  } else {
     int w;
     v -= max1;
     w = v;
@@ -73,1166 +156,427 @@
 }
 
 static int inv_remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
+  const int n = 255;
 
-  v = merge_index(v, n - 1, modulus);
+  v = merge_index(v, n - 1, MODULUS_PARAM);
+  m--;
   if ((m << 1) <= n) {
-    return vp9_inv_recenter_nonneg(v + 1, m);
+    return 1 + inv_recenter_nonneg(v + 1, m);
   } else {
-    return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);
+    return n - inv_recenter_nonneg(v + 1, n - 1 - m);
   }
 }
 
-static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {
-  int delp = vp9_decode_term_subexp(bc, SUBEXP_PARAM, 255);
+vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) {
+  int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
   return (vp9_prob)inv_remap_prob(delp, oldp);
 }
 
-void vp9_init_de_quantizer(VP9D_COMP *pbi) {
-  int i;
+void vp9_init_dequantizer(VP9_COMMON *pc) {
   int q;
-  VP9_COMMON *const pc = &pbi->common;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
-    pc->Y1dequant[q][0] = (int16_t)vp9_dc_quant(q, pc->y1dc_delta_q);
-    pc->UVdequant[q][0] = (int16_t)vp9_dc_uv_quant(q, pc->uvdc_delta_q);
+    // DC value
+    pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q);
+    pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
 
-    /* all the ac values =; */
-    for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d_4x4[i];
-
-      pc->Y1dequant[q][rc] = (int16_t)vp9_ac_yquant(q);
-      pc->UVdequant[q][rc] = (int16_t)vp9_ac_uv_quant(q, pc->uvac_delta_q);
-    }
+    // AC values
+    pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
+    pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
   }
 }
 
-static int get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex) {
-  // Set the Q baseline allowing for any segment level adjustment
-  if (vp9_segfeature_active(mb, segment_id, SEG_LVL_ALT_Q)) {
-    if (mb->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      return vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q);  // Abs Value
-    else
-      return clamp(base_qindex + vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q),
-                   0, MAXQ);  // Delta Value
-  } else {
-    return base_qindex;
-  }
-}
-
-static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) {
+static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) {
   int i;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  xd->q_index = vp9_get_qindex(xd, segment_id, pc->base_qindex);
 
-  VP9_COMMON *const pc = &pbi->common;
-  const int segment_id = mb->mode_info_context->mbmi.segment_id;
-  const int qindex = get_qindex(mb, segment_id, pc->base_qindex);
-  mb->q_index = qindex;
+  xd->plane[0].dequant = pc->y_dequant[xd->q_index];
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    xd->plane[i].dequant = pc->uv_dequant[xd->q_index];
+}
 
-  for (i = 0; i < 16; i++)
-    mb->block[i].dequant = pc->Y1dequant[qindex];
+static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  MACROBLOCKD* const xd = arg;
+  int16_t* const qcoeff = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+  const int stride = xd->plane[plane].dst.stride;
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 stride);
 
-  for (i = 16; i < 24; i++)
-    mb->block[i].dequant = pc->UVdequant[qindex];
+  TX_TYPE tx_type;
 
-  if (mb->lossless) {
-    assert(qindex == 0);
-    mb->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;
-    mb->inv_txm4x4        = vp9_short_iwalsh4x4;
-    mb->itxm_add          = vp9_dequant_idct_add_lossless_c;
-    mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block_lossless_c;
-    mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;
-  } else {
-    mb->inv_txm4x4_1      = vp9_short_idct4x4_1;
-    mb->inv_txm4x4        = vp9_short_idct4x4;
-    mb->itxm_add          = vp9_dequant_idct_add;
-    mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block;
-    mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;
+  switch (ss_txfrm_size / 2) {
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        xd->itxm_add(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      else
+        vp9_iht_add_c(tx_type, qcoeff, dst, stride,
+                      xd->plane[plane].eobs[block]);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride,
+                        xd->plane[plane].eobs[block]);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride,
+                          xd->plane[plane].eobs[block]);
+      break;
+    case TX_32X32:
+      vp9_idct_add_32x32(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      break;
   }
 }
 
-/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
- *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
- */
-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          int mb_row, int mb_col) {
-  BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
+static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                               int ss_txfrm_size, void *arg) {
+  MACROBLOCKD* const xd = arg;
+  int16_t* const qcoeff = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+  const int stride = xd->plane[plane].dst.stride;
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 stride);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  TX_TYPE tx_type;
+  int mode, b_mode;
+  int plane_b_size;
+  int tx_ib = raster_block >> tx_size;
+  mode = plane == 0? xd->mode_info_context->mbmi.mode:
+                     xd->mode_info_context->mbmi.uv_mode;
 
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    if (sb_type == BLOCK_SIZE_SB64X64) {
-      vp9_build_intra_predictors_sb64uv_s(xd);
-      vp9_build_intra_predictors_sb64y_s(xd);
-    } else if (sb_type == BLOCK_SIZE_SB32X32) {
-      vp9_build_intra_predictors_sbuv_s(xd);
-      vp9_build_intra_predictors_sby_s(xd);
-    } else {
-      vp9_build_intra_predictors_mbuv_s(xd);
-      vp9_build_intra_predictors_mby_s(xd);
-    }
-  } else {
-    if (sb_type == BLOCK_SIZE_SB64X64) {
-      vp9_build_inter64x64_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else if (sb_type == BLOCK_SIZE_SB32X32) {
-      vp9_build_inter32x32_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else {
-      vp9_build_inter16x16_predictors_mb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    }
-  }
-}
 
-static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc) {
-  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("qcoeff 16x16\n");
-    for (i = 0; i < 400; i++) {
-      printf("%3d ", xd->qcoeff[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-    printf("\n");
-    printf("predictor\n");
-    for (i = 0; i < 400; i++) {
-      printf("%3d ", xd->predictor[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-  }
-#endif
-  if (tx_type != DCT_DCT) {
-    vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff,
-                                    xd->block[0].dequant, xd->predictor,
-                                    xd->dst.y_buffer, 16, xd->dst.y_stride,
-                                    xd->eobs[0]);
+  if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 && plane == 0) {
+    assert(bsize == BLOCK_SIZE_SB8X8);
+    b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first;
   } else {
-    vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
-                               xd->predictor, xd->dst.y_buffer,
-                               16, xd->dst.y_stride, xd->eobs[0]);
+    b_mode = mode;
   }
-  vp9_dequant_idct_add_uv_block_8x8(
-      xd->qcoeff + 16 * 16, xd->block[16].dequant,
-      xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-      xd->dst.uv_stride, xd);
-}
 
-static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                       BOOL_DECODER* const bc) {
-  // First do Y
-  // if the first one is DCT_DCT assume all the rest are as well
-  TX_TYPE tx_type = get_tx_type_8x8(xd, 0);
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("qcoeff 8x8\n");
-    for (i = 0; i < 384; i++) {
-      printf("%3d ", xd->qcoeff[i]);
-      if (i % 16 == 15) printf("\n");
-    }
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    extend_for_intra(xd, plane, block, bsize, ss_txfrm_size);
   }
-#endif
-  if (tx_type != DCT_DCT || xd->mode_info_context->mbmi.mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      int idx = (ib & 0x02) ? (ib + 2) : ib;
-      int16_t *q  = xd->block[idx].qcoeff;
-      int16_t *dq = xd->block[0].dequant;
-      uint8_t *pre = xd->block[ib].predictor;
-      uint8_t *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
-      int stride = xd->dst.y_stride;
-      BLOCKD *b = &xd->block[ib];
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
-        int i8x8mode = b->bmi.as_mode.first;
-        vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
-      }
-      tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,
-                                      xd->eobs[idx]);
-      } else {
-        vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,
-                                   xd->eobs[idx]);
-      }
-    }
-  } else {
-    vp9_dequant_idct_add_y_block_8x8(xd->qcoeff,
-                                     xd->block[0].dequant,
-                                     xd->predictor,
-                                     xd->dst.y_buffer,
-                                     xd->dst.y_stride,
-                                     xd);
-  }
 
-  // Now do UV
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      BLOCKD *b = &xd->block[ib];
-      int i8x8mode = b->bmi.as_mode.first;
+  plane_b_size = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size,
+                          b_mode, dst, xd->plane[plane].dst.stride);
 
-      b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
-
-      b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
-    }
-  } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,
-         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-         xd->dst.uv_stride, xd);
-  } else {
-    vp9_dequant_idct_add_uv_block_8x8
-        (xd->qcoeff + 16 * 16, xd->block[16].dequant,
-         xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
-         xd->dst.uv_stride, xd);
+  switch (ss_txfrm_size / 2) {
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        xd->itxm_add(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      else
+        vp9_iht_add_c(tx_type, qcoeff, dst, stride,
+                      xd->plane[plane].eobs[block]);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride,
+                        xd->plane[plane].eobs[block]);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride,
+                          xd->plane[plane].eobs[block]);
+      break;
+    case TX_32X32:
+      vp9_idct_add_32x32(qcoeff, dst, stride, xd->plane[plane].eobs[block]);
+      break;
   }
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("predictor\n");
-    for (i = 0; i < 384; i++) {
-      printf("%3d ", xd->predictor[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-  }
-#endif
 }
 
-static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                       BOOL_DECODER* const bc) {
-  TX_TYPE tx_type;
-  int i, eobtotal = 0;
-  MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-#if 0  // def DEC_DEBUG
-  if (dec_debug) {
-    int i;
-    printf("\n");
-    printf("predictor\n");
-    for (i = 0; i < 384; i++) {
-      printf("%3d ", xd->predictor[i]);
-      if (i % 16 == 15) printf("\n");
-    }
-  }
-#endif
-  if (mode == I8X8_PRED) {
-    for (i = 0; i < 4; i++) {
-      int ib = vp9_i8x8_block[i];
-      const int iblock[4] = {0, 1, 4, 5};
-      int j;
-      BLOCKD *b = &xd->block[ib];
-      int i8x8mode = b->bmi.as_mode.first;
-      vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);
-      for (j = 0; j < 4; j++) {
-        b = &xd->block[ib + iblock[j]];
-        tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
-        if (tx_type != DCT_DCT) {
-          vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                    b->dequant, b->predictor,
-                                    *(b->base_dst) + b->dst, 16,
-                                    b->dst_stride, xd->eobs[ib + iblock[j]]);
-        } else {
-          xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                       *(b->base_dst) + b->dst, 16, b->dst_stride,
-                       xd->eobs[ib + iblock[j]]);
-        }
-      }
-      b = &xd->block[16 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);
-      b = &xd->block[20 + i];
-      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);
-      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);
-    }
-  } else if (mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-#if CONFIG_NEWBINTRAMODES
-      xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =
-          vp9_find_bpred_context(xd, b);
-#endif
-      if (!xd->mode_info_context->mbmi.mb_skip_coeff)
-        eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
+static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col,
+                        vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-      vp9_intra4x4_predict(xd, b, b_mode, b->predictor);
-      tx_type = get_tx_type_4x4(xd, i);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16, b->dst_stride,
-                                  xd->eobs[i]);
-      } else {
-        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);
-      }
-    }
-    if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
-      vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);
-    }
-    vp9_build_intra_predictors_mbuv(xd);
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
-                           xd->block[16].dequant,
-                           xd->predictor + 16 * 16,
-                           xd->dst.u_buffer,
-                           xd->dst.v_buffer,
-                           xd->dst.uv_stride,
-                           xd);
-  } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) {
-    xd->itxm_add_y_block(xd->qcoeff,
-                          xd->block[0].dequant,
-                          xd->predictor,
-                          xd->dst.y_buffer,
-                          xd->dst.y_stride,
-                          xd);
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
-                           xd->block[16].dequant,
-                           xd->predictor + 16 * 16,
-                           xd->dst.u_buffer,
-                           xd->dst.v_buffer,
-                           xd->dst.uv_stride,
-                           xd);
-  } else {
-#if 0  // def DEC_DEBUG
-    if (dec_debug) {
-      int i;
-      printf("\n");
-      printf("qcoeff 4x4\n");
-      for (i = 0; i < 400; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 400; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-    }
-#endif
-    for (i = 0; i < 16; i++) {
-      BLOCKD *b = &xd->block[i];
-      tx_type = get_tx_type_4x4(xd, i);
-      if (tx_type != DCT_DCT) {
-        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
-                                  b->dequant, b->predictor,
-                                  *(b->base_dst) + b->dst, 16,
-                                  b->dst_stride, xd->eobs[i]);
-      } else {
-        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,
-                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);
-      }
-    }
-    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,
-                           xd->block[16].dequant,
-                           xd->predictor + 16 * 16,
-                           xd->dst.u_buffer,
-                           xd->dst.v_buffer,
-                           xd->dst.uv_stride,
-                           xd);
-  }
-}
+  assert(mbmi->ref_frame[0] != INTRA_FRAME);
 
-static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                                int mb_row, int mb_col,
-                                BOOL_DECODER* const bc) {
-  int n, eobtotal;
-  VP9_COMMON *const pc = &pbi->common;
-  MODE_INFO *mi = xd->mode_info_context;
-  const int mis = pc->mode_info_stride;
+  if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only))
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
 
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);
+  // prediction
+  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
+  } else {
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(&pbi->common, xd);
 
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_sb64_tokens_context(xd);
-
-    /* Special case:  Force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     */
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
+    if (!vp9_reader_has_error(r)) {
+      vp9_decode_tokens(pbi, xd, r, bsize);
+    }
   }
+  foreach_transformed_block(xd, bsize, decode_block, xd);
+}
 
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sb64y_s(xd);
-    vp9_build_intra_predictors_sb64uv_s(xd);
+static void decode_sb_intra(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                          int mi_row, int mi_col,
+                          vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
   } else {
-    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
-  }
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(&pbi->common, xd);
 
-  /* dequantization and idct */
-  eobtotal = vp9_decode_sb64_tokens(pbi, xd, bc);
-  if (eobtotal == 0) {  // skip loopfilter
-    for (n = 0; n < 16; n++) {
-      const int x_idx = n & 3, y_idx = n >> 2;
-
-      if (mb_col + x_idx < pc->mb_cols && mb_row + y_idx < pc->mb_rows)
-        mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+    if (!vp9_reader_has_error(r)) {
+      vp9_decode_tokens(pbi, xd, r, bsize);
     }
-  } else {
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int y_offset = x_idx * 32 + y_idx * xd->dst.y_stride * 32;
-          vp9_dequant_idct_add_32x32(xd->qcoeff + n * 1024,
-              xd->block[0].dequant,
-              xd->dst.y_buffer + y_offset,
-              xd->dst.y_buffer + y_offset,
-              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 64]);
-        }
-        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096,
-            xd->block[16].dequant, xd->dst.u_buffer, xd->dst.u_buffer,
-            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256]);
-        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096 + 1024,
-            xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer,
-            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]);
-        break;
-      case TX_16X16:
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16;
-          const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                                    (y_idx * 16 + x_idx) * 4);
-
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          } else {
-            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          }
-        }
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int uv_offset = y_idx * 16 * xd->dst.uv_stride + x_idx * 16;
-          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + n * 256,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 16]);
-          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + 1024 + n * 256,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]);
-        }
-        break;
-      case TX_8X8:
-        for (n = 0; n < 64; n++) {
-          const int x_idx = n & 7, y_idx = n >> 3;
-          const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8;
-          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          } else {
-            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          }
-        }
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8;
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 4]);
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096 + 1024,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]);
-        }
-        break;
-      case TX_4X4:
-        for (n = 0; n < 256; n++) {
-          const int x_idx = n & 15, y_idx = n >> 4;
-          const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4;
-          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
-          if (tx_type == DCT_DCT) {
-            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          } else {
-            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          }
-        }
-        for (n = 0; n < 64; n++) {
-          const int x_idx = n & 7, y_idx = n >> 3;
-          const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4;
-          xd->itxm_add(xd->qcoeff + 4096 + n * 16,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n]);
-          xd->itxm_add(xd->qcoeff + 4096 + 1024 + n * 16,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n]);
-        }
-        break;
-      default: assert(0);
-    }
   }
+
+  foreach_transformed_block(xd, bsize, decode_block_intra, xd);
 }
 
-static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                                int mb_row, int mb_col,
-                                BOOL_DECODER* const bc) {
+
+static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
+                      vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize);
+  const int bw = 1 << bwl, bh = 1 << bhl;
   int n, eobtotal;
   VP9_COMMON *const pc = &pbi->common;
+  MODE_INFO *const mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
   const int mis = pc->mode_info_stride;
 
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+  assert(mbmi->sb_type == bsize);
+  assert(mbmi->ref_frame[0] != INTRA_FRAME);
 
   if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
 
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
+  // generate prediction
+  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd);
-
-    /* Special case:  Force the loopfilter to skip when eobtotal and
-     * mb_skip_coeff are zero.
-     */
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
-  }
-
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(xd);
-    vp9_build_intra_predictors_sbuv_s(xd);
+  if (mbmi->mb_skip_coeff) {
+    vp9_reset_sb_tokens_context(xd, bsize);
   } else {
-    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
-  }
+    // re-initialize macroblock dequantizer before detokenization
+    if (xd->segmentation_enabled)
+      mb_init_dequantizer(pc, xd);
 
-  /* dequantization and idct */
-  eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
-  if (eobtotal == 0) {  // skip loopfilter
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    if (mb_col + 1 < pc->mb_cols)
-      xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
-    if (mb_row + 1 < pc->mb_rows) {
-      xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
-      if (mb_col + 1 < pc->mb_cols)
-        xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
-    }
-  } else {
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_dequant_idct_add_32x32(xd->qcoeff, xd->block[0].dequant,
-                                   xd->dst.y_buffer, xd->dst.y_buffer,
-                                   xd->dst.y_stride, xd->dst.y_stride,
-                                   xd->eobs[0]);
-        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
-                                              xd->block[16].dequant,
-                                              xd->dst.u_buffer,
-                                              xd->dst.v_buffer,
-                                              xd->dst.uv_stride, xd);
-        break;
-      case TX_16X16:
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16;
-          const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                                    (y_idx * 8 + x_idx) * 4);
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_16x16(
-                xd->qcoeff + n * 256, xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          } else {
-            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);
-          }
-        }
-        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,
-                                              xd->block[16].dequant,
-                                              xd->dst.u_buffer,
-                                              xd->dst.v_buffer,
-                                              xd->dst.uv_stride, xd);
-        break;
-      case TX_8X8:
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8;
-          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
-          if (tx_type == DCT_DCT) {
-            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          } else {
-            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);
-          }
-        }
-        for (n = 0; n < 4; n++) {
-          const int x_idx = n & 1, y_idx = n >> 1;
-          const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8;
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1024,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n * 4]);
-          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1280,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]);
-        }
-        break;
-      case TX_4X4:
-        for (n = 0; n < 64; n++) {
-          const int x_idx = n & 7, y_idx = n >> 3;
-          const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4;
+    // dequantization and idct
+    eobtotal = vp9_decode_tokens(pbi, xd, r, bsize);
+    if (eobtotal == 0) {  // skip loopfilter
+      for (n = 0; n < bw * bh; n++) {
+        const int x_idx = n & (bw - 1), y_idx = n >> bwl;
 
-          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
-          if (tx_type == DCT_DCT) {
-            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          } else {
-            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,
-                xd->block[0].dequant,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_buffer + y_offset,
-                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);
-          }
-        }
-
-        for (n = 0; n < 16; n++) {
-          const int x_idx = n & 3, y_idx = n >> 2;
-          const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4;
-          xd->itxm_add(xd->qcoeff + 1024 + n * 16,
-              xd->block[16].dequant,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.u_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n]);
-          xd->itxm_add(xd->qcoeff + 1280 + n * 16,
-              xd->block[20].dequant,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.v_buffer + uv_offset,
-              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n]);
-        }
-        break;
-      default: assert(0);
-    }
-  }
-}
-
-static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                              int mb_row, unsigned int mb_col,
-                              BOOL_DECODER* const bc) {
-  int eobtotal = 0;
-  MB_PREDICTION_MODE mode;
-  int tx_size;
-
-  assert(!xd->mode_info_context->mbmi.sb_type);
-
-  // re-initialize macroblock dequantizer before detokenization
-  if (xd->segmentation_enabled)
-    mb_init_dequantizer(pbi, xd);
-
-  tx_size = xd->mode_info_context->mbmi.txfm_size;
-  mode = xd->mode_info_context->mbmi.mode;
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    vp9_reset_mb_tokens_context(xd);
-  } else if (!bool_error(bc)) {
-    if (mode != B_PRED)
-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-  }
-
-  //mode = xd->mode_info_context->mbmi.mode;
-  if (pbi->common.frame_type != KEY_FRAME)
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,
-                             &pbi->common);
-
-  if (eobtotal == 0 &&
-      mode != B_PRED &&
-      mode != SPLITMV &&
-      mode != I8X8_PRED &&
-      !bool_error(bc)) {
-    /* Special case:  Force the loopfilter to skip when eobtotal and
-       mb_skip_coeff are zero. */
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    skip_recon_mb(pbi, xd, mb_row, mb_col);
-    return;
-  }
-#if 0  // def DEC_DEBUG
-  if (dec_debug)
-    printf("Decoding mb:  %d %d\n", xd->mode_info_context->mbmi.mode, tx_size);
-#endif
-
-  // moved to be performed before detokenization
-  //  if (xd->segmentation_enabled)
-  //    mb_init_dequantizer(pbi, xd);
-
-  /* do prediction */
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    if (mode != I8X8_PRED) {
-      vp9_build_intra_predictors_mbuv(xd);
-      if (mode != B_PRED) {
-        vp9_build_intra_predictors_mby(xd);
+        if (mi_col + x_idx < pc->mi_cols && mi_row + y_idx < pc->mi_rows)
+          mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = 1;
       }
+    } else {
+      foreach_transformed_block(xd, bsize, decode_block, xd);
     }
-  } else {
-#if 0  // def DEC_DEBUG
-  if (dec_debug)
-    printf("Decoding mb:  %d %d interp %d\n",
-           xd->mode_info_context->mbmi.mode, tx_size,
-           xd->mode_info_context->mbmi.interp_filter);
-#endif
-    vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
   }
-
-  if (tx_size == TX_16X16) {
-    decode_16x16(pbi, xd, bc);
-  } else if (tx_size == TX_8X8) {
-    decode_8x8(pbi, xd, bc);
-  } else {
-    decode_4x4(pbi, xd, bc);
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int i, j;
-    printf("\n");
-    printf("predictor y\n");
-    for (i = 0; i < 16; i++) {
-      for (j = 0; j < 16; j++)
-        printf("%3d ", xd->predictor[i * 16 + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final y\n");
-    for (i = 0; i < 16; i++) {
-      for (j = 0; j < 16; j++)
-        printf("%3d ", xd->dst.y_buffer[i * xd->dst.y_stride + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final u\n");
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++)
-        printf("%3d ", xd->dst.u_buffer[i * xd->dst.uv_stride + j]);
-      printf("\n");
-    }
-    printf("\n");
-    printf("final v\n");
-    for (i = 0; i < 8; i++) {
-      for (j = 0; j < 8; j++)
-        printf("%3d ", xd->dst.v_buffer[i * xd->dst.uv_stride + j]);
-      printf("\n");
-    }
-    fflush(stdout);
-  }
-#endif
 }
 
-
-static int get_delta_q(vp9_reader *bc, int prev, int *q_update) {
-  int ret_val = 0;
-
-  if (vp9_read_bit(bc)) {
-    ret_val = vp9_read_literal(bc, 4);
-
-    if (vp9_read_bit(bc))
-      ret_val = -ret_val;
-  }
-
-  /* Trigger a quantizer update if the delta-q value has changed */
-  if (ret_val != prev)
-    *q_update = 1;
-
-  return ret_val;
-}
-
-#ifdef PACKET_TESTING
-#include <stdio.h>
-FILE *vpxlog = 0;
-#endif
-
-static void set_offsets(VP9D_COMP *pbi, int block_size,
-                        int mb_row, int mb_col) {
+static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
+                        int mi_row, int mi_col) {
+  const int bh = 1 << mi_height_log2(bsize);
+  const int bw = 1 << mi_width_log2(bsize);
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  const int mis = cm->mode_info_stride;
-  const int idx = mis * mb_row + mb_col;
-  const int dst_fb_idx = cm->new_fb_idx;
-  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
-  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
-  const int recon_yoffset = mb_row * 16 * recon_y_stride + 16 * mb_col;
-  const int recon_uvoffset = mb_row * 8 * recon_uv_stride + 8 * mb_col;
+  const int mi_idx = mi_row * cm->mode_info_stride + mi_col;
+  int i;
 
-  xd->mode_info_context = cm->mi + idx;
-  xd->mode_info_context->mbmi.sb_type = block_size >> 5;
-  xd->prev_mode_info_context = cm->prev_mi + idx;
-  xd->above_context = cm->above_context + mb_col;
-  xd->left_context = cm->left_context + (mb_row & 3);
+  xd->mode_info_context = cm->mi + mi_idx;
+  xd->mode_info_context->mbmi.sb_type = bsize;
+  // Special case: if prev_mi is NULL, the previous mode info context
+  // cannot be used.
+  xd->prev_mode_info_context = cm->prev_mi ?
+                                 cm->prev_mi + mi_idx : NULL;
 
-  // Distance of Mb to the various image edges.
-  // These are specified to 8th pel as they are always compared to
-  // values that are in 1/8th pel units
-  block_size >>= 4;  // in mb units
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].above_context = cm->above_context[i] +
+        (mi_col * 2 >> xd->plane[i].subsampling_x);
+    xd->plane[i].left_context = cm->left_context[i] +
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+  }
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
 
-  set_mb_row(cm, xd, mb_row, block_size);
-  set_mb_col(cm, xd, mb_col, block_size);
+  // Distance of Mb to the various image edges. These are specified to 8th pel
+  // as they are always compared to values that are in 1/8th pel units
+  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
 }
 
-static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {
+static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  if (mbmi->ref_frame > INTRA_FRAME) {
+  if (mbmi->ref_frame[0] > INTRA_FRAME) {
     // Select the appropriate reference frame for this MB
-    int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];
-    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
-    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];
-    setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
+    const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
+    const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
+    xd->scale_factor[0]    = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+    setup_pre_planes(xd, cfg, NULL, mi_row, mi_col,
+                     xd->scale_factor, xd->scale_factor_uv);
+    xd->corrupted |= cfg->corrupted;
 
-    // propagate errors from reference frames
-    xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
-
-    if (mbmi->second_ref_frame > INTRA_FRAME) {
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
       // Select the appropriate reference frame for this MB
-      int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];
-
-      setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-
-      // propagate errors from reference frames
-      xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
+      const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
+      const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
+      xd->scale_factor[1]    = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+      xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+      setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col,
+                       xd->scale_factor, xd->scale_factor_uv);
+      xd->corrupted |= second_cfg->corrupted;
     }
   }
 }
 
-/* Decode a row of Superblocks (2x2 region of MBs) */
-static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
-                          int mb_row, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc) {
-  int mb_col;
+static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
+                           vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &pbi->mb;
 
-  // For a SB there are 2 left contexts, each pertaining to a MB row within
-  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+  set_offsets(pbi, bsize, mi_row, mi_col);
+  vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
+  set_refs(pbi, mi_row, mi_col);
 
-  for (mb_col = pc->cur_tile_mb_col_start;
-       mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {
-    if (vp9_read(bc, pc->sb64_coded)) {
-#ifdef DEC_DEBUG
-      dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&
-                   mb_row == 8 && mb_col == 0);
-      if (dec_debug)
-        printf("Debug Decode SB64\n");
-#endif
-      set_offsets(pbi, 64, mb_row, mb_col);
-      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
-      set_refs(pbi, 64, mb_row, mb_col);
-      decode_superblock64(pbi, xd, mb_row, mb_col, bc);
-      xd->corrupted |= bool_error(bc);
-    } else {
-      int j;
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
+                                     BLOCK_SIZE_SB8X8 : bsize);
+  else if (bsize < BLOCK_SIZE_SB8X8)
+    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+  else
+    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
 
-      for (j = 0; j < 4; j++) {
-        const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
+  xd->corrupted |= vp9_reader_has_error(r);
+}
 
-        if (mb_row + y_idx_sb >= pc->mb_rows ||
-            mb_col + x_idx_sb >= pc->mb_cols) {
-          // MB lies outside frame, skip on to next
-          continue;
-        }
+static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
+                            vp9_reader* r, BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  int bsl = mi_width_log2(bsize), bs = (1 << bsl) / 2;
+  int n;
+  PARTITION_TYPE partition = PARTITION_NONE;
+  BLOCK_SIZE_TYPE subsize;
 
-        xd->sb_index = j;
+  if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
+    return;
 
-        if (vp9_read(bc, pc->sb32_coded)) {
-#ifdef DEC_DEBUG
-          dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&
-                       mb_row + y_idx_sb == 8 && mb_col + x_idx_sb == 0);
-          if (dec_debug)
-            printf("Debug Decode SB32\n");
-#endif
-          set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
-          vp9_decode_mb_mode_mv(pbi,
-                                xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
-          set_refs(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
-          decode_superblock32(pbi,
-                              xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
-          xd->corrupted |= bool_error(bc);
-        } else {
-          int i;
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
 
-          // Process the 4 MBs within the SB in the order:
-          // top-left, top-right, bottom-left, bottom-right
-          for (i = 0; i < 4; i++) {
-            const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    int pl;
+    int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize);
+    // read the partition information
+    xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = pc->above_seg_context + mi_col;
+    pl = partition_plane_context(xd, bsize);
 
-            if (mb_row + y_idx >= pc->mb_rows ||
-                mb_col + x_idx >= pc->mb_cols) {
-              // MB lies outside frame, skip on to next
-              continue;
-            }
-#ifdef DEC_DEBUG
-            dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&
-                         mb_row + y_idx == 8 && mb_col + x_idx == 0);
-            if (dec_debug)
-              printf("Debug Decode MB\n");
-#endif
+    if (idx == 0)
+      partition = treed_read(r, vp9_partition_tree,
+                             pc->fc.partition_prob[pc->frame_type][pl]);
+    else if (idx > 0 &&
+        !vp9_read(r, pc->fc.partition_prob[pc->frame_type][pl][idx]))
+      partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
+    else
+      partition = PARTITION_SPLIT;
 
-            set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
-            xd->mb_index = i;
-            vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
-            set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
-            decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
+    pc->fc.partition_counts[pl][partition]++;
+  }
 
-            /* check if the boolean decoder has suffered an error */
-            xd->corrupted |= bool_error(bc);
-          }
-        }
+  subsize = get_subsize(bsize, partition);
+  *(get_sb_index(xd, subsize)) = 0;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      break;
+    case PARTITION_HORZ:
+      decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      if (mi_row + bs < pc->mi_rows)
+        decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
+      break;
+    case PARTITION_VERT:
+      decode_modes_b(pbi, mi_row, mi_col, r, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      if (mi_col + bs < pc->mi_cols)
+        decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
+      break;
+    case PARTITION_SPLIT:
+      for (n = 0; n < 4; n++) {
+        int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
+        decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
       }
-    }
+      break;
+    default:
+      assert(0);
   }
+  // update partition context
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(pc, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
-
 static void setup_token_decoder(VP9D_COMP *pbi,
-                                const unsigned char *cx_data,
-                                BOOL_DECODER* const bool_decoder) {
+                                const uint8_t *data, size_t read_size,
+                                vp9_reader *r) {
   VP9_COMMON *pc = &pbi->common;
-  const unsigned char *user_data_end = pbi->Source + pbi->source_sz;
-  const unsigned char *partition = cx_data;
-  ptrdiff_t bytes_left = user_data_end - partition;
-  ptrdiff_t partition_size = bytes_left;
+  const uint8_t *data_end = pbi->source + pbi->source_sz;
 
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
-  if (!read_is_valid(partition, partition_size, user_data_end)) {
+  if (!read_is_valid(data, read_size, data_end))
     vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Truncated packet or corrupt partition "
-                       "%d length", 1);
-  }
+                       "Truncated packet or corrupt tile length");
 
-  if (vp9_start_decode(bool_decoder,
-                       partition, (unsigned int)partition_size))
+  if (vp9_reader_init(r, data, read_size))
     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
 
-static void init_frame(VP9D_COMP *pbi) {
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd = &pbi->mb;
+static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size,
+                                   vp9_reader *r) {
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+  vp9_coeff_probs_model *coef_probs = fc->coef_probs[tx_size];
 
-  if (pc->frame_type == KEY_FRAME) {
-    vp9_setup_past_independence(pc, xd);
-    // All buffers are implicitly updated on key frames.
-    pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;
-  } else if (pc->error_resilient_mode) {
-    vp9_setup_past_independence(pc, xd);
-  }
-
-  if (pc->frame_type != KEY_FRAME) {
-    pc->mcomp_filter_type = pc->use_bilinear_mc_filter ? BILINEAR : EIGHTTAP;
-
-    // To enable choice of different interpolation filters
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
-
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-  xd->frame_type = pc->frame_type;
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  xd->mode_info_stride = pc->mode_info_stride;
-  xd->corrupted = 0;
-  xd->fullpixel_mask = pc->full_pixel ? 0xfffffff8 : 0xffffffff;
-}
-
-#if CONFIG_CODE_NONZEROCOUNT
-static void read_nzc_probs_common(VP9_COMMON *cm,
-                                  BOOL_DECODER* const bc,
-                                  int block_size) {
-  int c, r, b, t;
-  int tokens, nodes;
-  vp9_prob *nzc_probs;
-  vp9_prob upd;
-
-  if (!vp9_read_bit(bc)) return;
-
-  if (block_size == 32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    upd = NZC_UPDATE_PROB_32X32;
-  } else if (block_size == 16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    upd = NZC_UPDATE_PROB_16X16;
-  } else if (block_size == 8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    upd = NZC_UPDATE_PROB_8X8;
-  } else {
-    tokens = NZC4X4_TOKENS;
-    nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    upd = NZC_UPDATE_PROB_4X4;
-  }
-  nodes = tokens - 1;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        for (t = 0; t < nodes; ++t) {
-          vp9_prob *p = &nzc_probs[offset_nodes + t];
-          if (vp9_read(bc, upd)) {
-            *p = read_prob_diff_update(bc, *p);
-          }
-        }
-      }
-    }
-  }
-}
-
-static void read_nzc_pcat_probs(VP9_COMMON *cm, BOOL_DECODER* const bc) {
-  int c, t, b;
-  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
-  if (!vp9_read_bit(bc)) {
-    return;
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        vp9_prob *p = &cm->fc.nzc_pcat_probs[c][t][b];
-        if (vp9_read(bc, upd)) {
-          *p = read_prob_diff_update(bc, *p);
-        }
-      }
-    }
-  }
-}
-
-static void read_nzc_probs(VP9_COMMON *cm,
-                           BOOL_DECODER* const bc) {
-  read_nzc_probs_common(cm, bc, 4);
-  if (cm->txfm_mode != ONLY_4X4)
-    read_nzc_probs_common(cm, bc, 8);
-  if (cm->txfm_mode > ALLOW_8X8)
-    read_nzc_probs_common(cm, bc, 16);
-  if (cm->txfm_mode > ALLOW_16X16)
-    read_nzc_probs_common(cm, bc, 32);
-#ifdef NZC_PCAT_UPDATE
-  read_nzc_pcat_probs(cm, bc);
-#endif
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-static void read_coef_probs_common(BOOL_DECODER* const bc,
-                                   vp9_coeff_probs *coef_probs,
-                                   int block_types) {
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-  const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES;
-#else
-  const int entropy_nodes_update = ENTROPY_NODES;
-#endif
-
   int i, j, k, l, m;
 
-  if (vp9_read_bit(bc)) {
-    for (i = 0; i < block_types; i++) {
+  if (vp9_read_bit(r)) {
+    for (i = 0; i < BLOCK_TYPES; i++) {
       for (j = 0; j < REF_TYPES; j++) {
         for (k = 0; k < COEF_BANDS; k++) {
           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+            const int mstart = 0;
             if (l >= 3 && k == 0)
               continue;
-            for (m = CONFIG_CODE_NONZEROCOUNT; m < entropy_nodes_update; m++) {
+
+            for (m = mstart; m < entropy_nodes_update; m++) {
               vp9_prob *const p = coef_probs[i][j][k][l] + m;
 
-              if (vp9_read(bc, vp9_coef_update_prob[m])) {
-                *p = read_prob_diff_update(bc, *p);
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-                if (m == UNCONSTRAINED_NODES - 1)
-                  vp9_get_model_distribution(*p, coef_probs[i][j][k][l], i, j);
-#endif
+              if (vp9_read(r, vp9_coef_update_prob[m])) {
+                *p = vp9_read_prob_diff_update(r, *p);
               }
             }
           }
@@ -1242,159 +586,104 @@
   }
 }
 
-static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
-  VP9_COMMON *const pc = &pbi->common;
+static void read_coef_probs(VP9D_COMP *pbi, vp9_reader *r) {
+  const TXFM_MODE txfm_mode = pbi->common.txfm_mode;
+  FRAME_CONTEXT *const fc = &pbi->common.fc;
 
-  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES);
+  read_coef_probs_common(fc, TX_4X4, r);
 
-  if (pbi->common.txfm_mode != ONLY_4X4)
-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES);
+  if (txfm_mode > ONLY_4X4)
+    read_coef_probs_common(fc, TX_8X8, r);
 
-  if (pbi->common.txfm_mode > ALLOW_8X8)
-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES);
+  if (txfm_mode > ALLOW_8X8)
+    read_coef_probs_common(fc, TX_16X16, r);
 
-  if (pbi->common.txfm_mode > ALLOW_16X16)
-    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES);
+  if (txfm_mode > ALLOW_16X16)
+    read_coef_probs_common(fc, TX_32X32, r);
 }
 
-static void update_frame_size(VP9D_COMP *pbi) {
-  VP9_COMMON *cm = &pbi->common;
-
-  /* our internal buffers are always multiples of 16 */
-  const int width = (cm->width + 15) & ~15;
-  const int height = (cm->height + 15) & ~15;
-
-  cm->mb_rows = height >> 4;
-  cm->mb_cols = width >> 4;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
-  cm->mode_info_stride = cm->mb_cols + 1;
-  memset(cm->mip, 0,
-        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  vp9_update_mode_info_border(cm, cm->mip);
-
-  cm->mi = cm->mip + cm->mode_info_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
-  vp9_update_mode_info_in_image(cm, cm->mi);
-}
-
-static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
+static void setup_segmentation(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
   int i, j;
 
-  xd->segmentation_enabled = vp9_read_bit(r);
-  if (xd->segmentation_enabled) {
-    // Read whether or not the segmentation map is being explicitly updated
-    // this frame.
-    xd->update_mb_segmentation_map = vp9_read_bit(r);
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
 
-    // If so what method will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Which macro block level features are enabled. Read the probs used to
-      // decode the segment id for each macro block.
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-        xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r) : 255;
-      }
+  xd->update_mb_segmentation_map = 0;
+  xd->update_mb_segmentation_data = 0;
 
-      // Read the prediction probs needed to decode the segment id
-      pc->temporal_update = vp9_read_bit(r);
-      for (i = 0; i < PREDICTION_PROBS; i++) {
-        pc->segment_pred_probs[i] = pc->temporal_update
-            ? (vp9_read_bit(r) ? vp9_read_prob(r) : 255)
-            : 255;
-      }
+  xd->segmentation_enabled = vp9_rb_read_bit(rb);
+  if (!xd->segmentation_enabled)
+    return;
 
-      if (pc->temporal_update) {
-        const vp9_prob *p = xd->mb_segment_tree_probs;
-        vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs;
-        const int c0 =        p[0]  *        p[1];
-        const int c1 =        p[0]  * (256 - p[1]);
-        const int c2 = (256 - p[0]) *        p[2];
-        const int c3 = (256 - p[0]) * (256 - p[2]);
+  // Segmentation map update
+  xd->update_mb_segmentation_map = vp9_rb_read_bit(rb);
+  if (xd->update_mb_segmentation_map) {
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++)
+      xd->mb_segment_tree_probs[i] = vp9_rb_read_bit(rb) ?
+                                         vp9_rb_read_literal(rb, 8) : MAX_PROB;
 
-        p_mod[0] = get_binary_prob(c1, c2 + c3);
-        p_mod[1] = get_binary_prob(c0, c2 + c3);
-        p_mod[2] = get_binary_prob(c0 + c1, c3);
-        p_mod[3] = get_binary_prob(c0 + c1, c2);
-      }
+    cm->temporal_update = vp9_rb_read_bit(rb);
+    if (cm->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        cm->segment_pred_probs[i] = vp9_rb_read_bit(rb) ?
+                                        vp9_rb_read_literal(rb, 8) : MAX_PROB;
+    } else {
+      for (i = 0; i < PREDICTION_PROBS; i++)
+        cm->segment_pred_probs[i] = MAX_PROB;
     }
+  }
 
-    xd->update_mb_segmentation_data = vp9_read_bit(r);
-    if (xd->update_mb_segmentation_data) {
-      int data;
+  // Segmentation data update
+  xd->update_mb_segmentation_data = vp9_rb_read_bit(rb);
+  if (xd->update_mb_segmentation_data) {
+    xd->mb_segment_abs_delta = vp9_rb_read_bit(rb);
 
-      xd->mb_segment_abs_delta = vp9_read_bit(r);
+    vp9_clearall_segfeatures(xd);
 
-      vp9_clearall_segfeatures(xd);
-
-      // For each segmentation...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each of the segments features...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          // Is the feature enabled
-          if (vp9_read_bit(r)) {
-            // Update the feature data and mask
-            vp9_enable_segfeature(xd, i, j);
-
-            data = vp9_decode_unsigned_max(r, vp9_seg_feature_data_max(j));
-
-            // Is the segment data signed..
-            if (vp9_is_segfeature_signed(j)) {
-              if (vp9_read_bit(r))
-                data = -data;
-            }
-          } else {
-            data = 0;
-          }
-
-          vp9_set_segdata(xd, i, j, data);
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        int data = 0;
+        const int feature_enabled = vp9_rb_read_bit(rb);
+        if (feature_enabled) {
+          vp9_enable_segfeature(xd, i, j);
+          data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j));
+          if (vp9_is_segfeature_signed(j))
+            data = vp9_rb_read_bit(rb) ? -data : data;
         }
+        vp9_set_segdata(xd, i, j, data);
       }
     }
   }
 }
 
-static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {
-  int i;
+static void setup_loopfilter(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
 
-  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(r);
-  pc->filter_level = vp9_read_literal(r, 6);
-  pc->sharpness_level = vp9_read_literal(r, 3);
+  cm->filter_level = vp9_rb_read_literal(rb, 6);
+  cm->sharpness_level = vp9_rb_read_literal(rb, 3);
 
-#if CONFIG_LOOP_DERING
-  if (vp9_read_bit(r))
-    pc->dering_enabled = 1 + vp9_read_literal(r, 4);
-  else
-    pc->dering_enabled = 0;
-#endif
-
   // Read in loop filter deltas applied at the MB level based on mode or ref
   // frame.
   xd->mode_ref_lf_delta_update = 0;
-  xd->mode_ref_lf_delta_enabled = vp9_read_bit(r);
 
+  xd->mode_ref_lf_delta_enabled = vp9_rb_read_bit(rb);
   if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    xd->mode_ref_lf_delta_update = vp9_read_bit(r);
-
+    xd->mode_ref_lf_delta_update = vp9_rb_read_bit(rb);
     if (xd->mode_ref_lf_delta_update) {
-      // Send update
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        if (vp9_read_bit(r)) {
-          // sign = vp9_read_bit(r);
-          xd->ref_lf_deltas[i] = vp9_read_literal(r, 6);
+      int i;
 
-          if (vp9_read_bit(r))
-            xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];  // Apply sign
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        if (vp9_rb_read_bit(rb)) {
+          const int value = vp9_rb_read_literal(rb, 6);
+          xd->ref_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
         }
       }
 
-      // Send update
       for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        if (vp9_read_bit(r)) {
-          // sign = vp9_read_bit(r);
-          xd->mode_lf_deltas[i] = vp9_read_literal(r, 6);
-
-          if (vp9_read_bit(r))
-            xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];  // Apply sign
+        if (vp9_rb_read_bit(rb)) {
+          const int value = vp9_rb_read_literal(rb, 6);
+          xd->mode_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value;
         }
       }
     }
@@ -1401,168 +690,234 @@
   }
 }
 
-static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,
-                                      const uint8_t *data,
-                                      const uint8_t *data_end) {
-  VP9_COMMON *const pc = &pbi->common;
-  const int width = pc->width;
-  const int height = pc->height;
-
-  // If error concealment is enabled we should only parse the new size
-  // if we have enough data. Otherwise we will end up with the wrong size.
-  if (scaling_active && data + 4 < data_end) {
-    pc->display_width = read_le16(data + 0);
-    pc->display_height = read_le16(data + 2);
-    data += 4;
+static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
+  const int old = *delta_q;
+  if (vp9_rb_read_bit(rb)) {
+    const int value = vp9_rb_read_literal(rb, 4);
+    *delta_q = vp9_rb_read_bit(rb) ? -value : value;
   }
+  return old != *delta_q;
+}
 
-  if (data + 4 < data_end) {
-    pc->width = read_le16(data + 0);
-    pc->height = read_le16(data + 2);
-    data += 4;
-  }
+static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+  MACROBLOCKD *const xd = &pbi->mb;
+  VP9_COMMON *const cm = &pbi->common;
+  int update = 0;
 
-  if (!scaling_active) {
-    pc->display_width = pc->width;
-    pc->display_height = pc->height;
+  cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
+  update |= read_delta_q(rb, &cm->y_dc_delta_q);
+  update |= read_delta_q(rb, &cm->uv_dc_delta_q);
+  update |= read_delta_q(rb, &cm->uv_ac_delta_q);
+  if (update)
+    vp9_init_dequantizer(cm);
+
+  xd->lossless = cm->base_qindex == 0 &&
+                 cm->y_dc_delta_q == 0 &&
+                 cm->uv_dc_delta_q == 0 &&
+                 cm->uv_ac_delta_q == 0;
+  if (xd->lossless) {
+    xd->itxm_add          = vp9_idct_add_lossless_c;
+    xd->itxm_add_y_block  = vp9_idct_add_y_block_lossless_c;
+    xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
+  } else {
+    xd->itxm_add          = vp9_idct_add;
+    xd->itxm_add_y_block  = vp9_idct_add_y_block;
+    xd->itxm_add_uv_block = vp9_idct_add_uv_block;
   }
+}
 
-  if (width != pc->width || height != pc->height) {
-    if (pc->width <= 0) {
-      pc->width = width;
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid frame width");
-    }
+static INTERPOLATIONFILTERTYPE read_interp_filter_type(
+    struct vp9_read_bit_buffer *rb) {
+  return vp9_rb_read_bit(rb) ? SWITCHABLE
+                             : vp9_rb_read_literal(rb, 2);
+}
 
-    if (pc->height <= 0) {
-      pc->height = height;
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Invalid frame height");
-    }
+static void read_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb,
+                            int *width, int *height) {
+  const int w = vp9_rb_read_literal(rb, 16) + 1;
+  const int h = vp9_rb_read_literal(rb, 16) + 1;
+  *width = w;
+  *height = h;
+}
 
+static void setup_display_size(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  cm->display_width = cm->width;
+  cm->display_height = cm->height;
+  if (vp9_rb_read_bit(rb))
+    read_frame_size(cm, rb, &cm->display_width, &cm->display_height);
+}
+
+static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
+  VP9_COMMON *cm = &pbi->common;
+
+  if (cm->width != width || cm->height != height) {
     if (!pbi->initial_width || !pbi->initial_height) {
-      if (vp9_alloc_frame_buffers(pc, pc->width, pc->height))
-        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+      if (vp9_alloc_frame_buffers(cm, width, height))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate frame buffers");
-      pbi->initial_width = pc->width;
-      pbi->initial_height = pc->height;
-    }
+      pbi->initial_width = width;
+      pbi->initial_height = height;
+    } else {
+      if (width > pbi->initial_width)
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Frame width too large");
 
-    if (pc->width > pbi->initial_width) {
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Frame width too large");
+      if (height > pbi->initial_height)
+        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                           "Frame height too large");
     }
 
-    if (pc->height > pbi->initial_height) {
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Frame height too large");
-    }
+    cm->width = width;
+    cm->height = height;
 
-    update_frame_size(pbi);
+    vp9_update_frame_size(cm);
   }
 
-  return data;
+  vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9BORDERINPIXELS);
 }
 
-static void update_frame_context(VP9D_COMP *pbi, vp9_reader *r) {
-  FRAME_CONTEXT *const fc = &pbi->common.fc;
+static void setup_frame_size(VP9D_COMP *pbi,
+                             struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  int width, height;
+  read_frame_size(cm, rb, &width, &height);
+  setup_display_size(pbi, rb);
+  apply_frame_size(pbi, width, height);
+}
 
-  vp9_copy(fc->pre_coef_probs_4x4, fc->coef_probs_4x4);
-  vp9_copy(fc->pre_coef_probs_8x8, fc->coef_probs_8x8);
-  vp9_copy(fc->pre_coef_probs_16x16, fc->coef_probs_16x16);
-  vp9_copy(fc->pre_coef_probs_32x32, fc->coef_probs_32x32);
-  vp9_copy(fc->pre_ymode_prob, fc->ymode_prob);
-  vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);
-  vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
-  vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);
-  vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);
-  vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);
-  vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);
+static void setup_frame_size_with_refs(VP9D_COMP *pbi,
+                                       struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+
+  int width, height;
+  int found = 0, i;
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    if (vp9_rb_read_bit(rb)) {
+      YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]];
+      width = cfg->y_crop_width;
+      height = cfg->y_crop_height;
+      found = 1;
+      break;
+    }
+  }
+
+  if (!found)
+    read_frame_size(cm, rb, &width, &height);
+
+  if (!width || !height)
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Referenced frame with invalid size");
+
+  setup_display_size(pbi, rb);
+  apply_frame_size(pbi, width, height);
+}
+
+static void update_frame_context(FRAME_CONTEXT *fc) {
+  vp9_copy(fc->pre_coef_probs, fc->coef_probs);
+  vp9_copy(fc->pre_y_mode_prob, fc->y_mode_prob);
+  vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
+  vp9_copy(fc->pre_partition_prob, fc->partition_prob[1]);
+  vp9_copy(fc->pre_intra_inter_prob, fc->intra_inter_prob);
+  vp9_copy(fc->pre_comp_inter_prob, fc->comp_inter_prob);
+  vp9_copy(fc->pre_single_ref_prob, fc->single_ref_prob);
+  vp9_copy(fc->pre_comp_ref_prob, fc->comp_ref_prob);
   fc->pre_nmvc = fc->nmvc;
+  vp9_copy(fc->pre_switchable_interp_prob, fc->switchable_interp_prob);
+  vp9_copy(fc->pre_inter_mode_probs, fc->inter_mode_probs);
+  vp9_copy(fc->pre_tx_probs_8x8p, fc->tx_probs_8x8p);
+  vp9_copy(fc->pre_tx_probs_16x16p, fc->tx_probs_16x16p);
+  vp9_copy(fc->pre_tx_probs_32x32p, fc->tx_probs_32x32p);
+  vp9_copy(fc->pre_mbskip_probs, fc->mbskip_probs);
 
-  vp9_zero(fc->coef_counts_4x4);
-  vp9_zero(fc->coef_counts_8x8);
-  vp9_zero(fc->coef_counts_16x16);
-  vp9_zero(fc->coef_counts_32x32);
+  vp9_zero(fc->coef_counts);
   vp9_zero(fc->eob_branch_counts);
-  vp9_zero(fc->ymode_counts);
-  vp9_zero(fc->sb_ymode_counts);
+  vp9_zero(fc->y_mode_counts);
   vp9_zero(fc->uv_mode_counts);
-  vp9_zero(fc->bmode_counts);
-  vp9_zero(fc->i8x8_mode_counts);
-  vp9_zero(fc->sub_mv_ref_counts);
-  vp9_zero(fc->mbsplit_counts);
   vp9_zero(fc->NMVcount);
-  vp9_zero(fc->mv_ref_ct);
+  vp9_zero(fc->inter_mode_counts);
+  vp9_zero(fc->partition_counts);
+  vp9_zero(fc->switchable_interp_count);
+  vp9_zero(fc->intra_inter_count);
+  vp9_zero(fc->comp_inter_count);
+  vp9_zero(fc->single_ref_count);
+  vp9_zero(fc->comp_ref_count);
+  vp9_zero(fc->tx_count_8x8p);
+  vp9_zero(fc->tx_count_16x16p);
+  vp9_zero(fc->tx_count_32x32p);
+  vp9_zero(fc->mbskip_count);
+}
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  fc->pre_interintra_prob = fc->interintra_prob;
-  vp9_zero(fc->interintra_counts);
-#endif
+static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
+  VP9_COMMON *const pc = &pbi->common;
+  int mi_row, mi_col;
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4);
-  vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8);
-  vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16);
-  vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32);
-  vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs);
+  for (mi_row = pc->cur_tile_mi_row_start;
+       mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
+    // For a SB there are 2 left contexts, each pertaining to a MB row within
+    vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
+    vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
+    for (mi_col = pc->cur_tile_mi_col_start;
+         mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
+      decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
+  }
+}
 
-  vp9_zero(fc->nzc_counts_4x4);
-  vp9_zero(fc->nzc_counts_8x8);
-  vp9_zero(fc->nzc_counts_16x16);
-  vp9_zero(fc->nzc_counts_32x32);
-  vp9_zero(fc->nzc_pcat_counts);
-#endif
+static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+  int delta_log2_tiles;
 
-  read_coef_probs(pbi, r);
-#if CONFIG_CODE_NONZEROCOUNT
-  read_nzc_probs(&pbi->common, r);
-#endif
+  vp9_get_tile_n_bits(cm, &cm->log2_tile_columns, &delta_log2_tiles);
+  while (delta_log2_tiles--) {
+    if (vp9_rb_read_bit(rb)) {
+      cm->log2_tile_columns++;
+    } else {
+      break;
+    }
+  }
+
+  cm->log2_tile_rows = vp9_rb_read_bit(rb);
+  if (cm->log2_tile_rows)
+    cm->log2_tile_rows += vp9_rb_read_bit(rb);
+
+  cm->tile_columns = 1 << cm->log2_tile_columns;
+  cm->tile_rows    = 1 << cm->log2_tile_rows;
 }
 
 static void decode_tiles(VP9D_COMP *pbi,
-                         const uint8_t *data, int first_partition_size,
-                         BOOL_DECODER *header_bc, BOOL_DECODER *residual_bc) {
+                         const uint8_t *data, size_t first_partition_size,
+                         vp9_reader *residual_bc) {
   VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
 
   const uint8_t *data_ptr = data + first_partition_size;
-  int tile_row, tile_col, delta_log2_tiles;
-  int mb_row;
+  const uint8_t* const data_end = pbi->source + pbi->source_sz;
+  int tile_row, tile_col;
 
-  vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);
-  while (delta_log2_tiles--) {
-    if (vp9_read_bit(header_bc)) {
-      pc->log2_tile_columns++;
-    } else {
-      break;
-    }
-  }
-  pc->log2_tile_rows = vp9_read_bit(header_bc);
-  if (pc->log2_tile_rows)
-    pc->log2_tile_rows += vp9_read_bit(header_bc);
-  pc->tile_columns = 1 << pc->log2_tile_columns;
-  pc->tile_rows    = 1 << pc->log2_tile_rows;
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
+                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(pc));
 
-  vpx_memset(pc->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+  vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mi_cols_aligned_to_sb(pc));
 
   if (pbi->oxcf.inv_tile_order) {
     const int n_cols = pc->tile_columns;
     const uint8_t *data_ptr2[4][1 << 6];
-    BOOL_DECODER bc_bak = {0};
+    vp9_reader bc_bak = {0};
 
     // pre-initialize the offsets, we're going to read in inverse order
     data_ptr2[0][0] = data_ptr;
     for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
       if (tile_row) {
-        const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]);
+        const int size = read_be32(data_ptr2[tile_row - 1][n_cols - 1]);
         data_ptr2[tile_row - 1][n_cols - 1] += 4;
         data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;
       }
 
       for (tile_col = 1; tile_col < n_cols; tile_col++) {
-        const int size = read_le32(data_ptr2[tile_row][tile_col - 1]);
+        const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
         data_ptr2[tile_row][tile_col - 1] += 4;
         data_ptr2[tile_row][tile_col] =
             data_ptr2[tile_row][tile_col - 1] + size;
@@ -1573,14 +928,10 @@
       vp9_get_tile_row_offsets(pc, tile_row);
       for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) {
         vp9_get_tile_col_offsets(pc, tile_col);
-        setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], residual_bc);
-
-        // Decode a row of superblocks
-        for (mb_row = pc->cur_tile_mb_row_start;
-             mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
-          decode_sb_row(pbi, pc, mb_row, xd, residual_bc);
-        }
-
+        setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
+                            data_end - data_ptr2[tile_row][tile_col],
+                            residual_bc);
+        decode_tile(pbi, residual_bc);
         if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1)
           bc_bak = *residual_bc;
       }
@@ -1592,333 +943,295 @@
     for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {
       vp9_get_tile_row_offsets(pc, tile_row);
       for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {
+        size_t size;
+
         vp9_get_tile_col_offsets(pc, tile_col);
 
         has_more = tile_col < pc->tile_columns - 1 ||
                    tile_row < pc->tile_rows - 1;
+        if (has_more) {
+          if (!read_is_valid(data_ptr, 4, data_end))
+            vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile length");
 
-        // Setup decoder
-        setup_token_decoder(pbi, data_ptr + (has_more ? 4 : 0), residual_bc);
-
-        // Decode a row of superblocks
-        for (mb_row = pc->cur_tile_mb_row_start;
-             mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {
-          decode_sb_row(pbi, pc, mb_row, xd, residual_bc);
+          size = read_be32(data_ptr);
+          data_ptr += 4;
+        } else {
+          size = data_end - data_ptr;
         }
 
-        if (has_more) {
-          const int size = read_le32(data_ptr);
-          data_ptr += 4 + size;
-        }
+        setup_token_decoder(pbi, data_ptr, size, residual_bc);
+        decode_tile(pbi, residual_bc);
+        data_ptr += size;
       }
     }
   }
 }
 
-int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
-  BOOL_DECODER header_bc, residual_bc;
-  VP9_COMMON *const pc = &pbi->common;
-  MACROBLOCKD *const xd  = &pbi->mb;
-  const uint8_t *data = (const uint8_t *)pbi->Source;
-  const uint8_t *data_end = data + pbi->source_sz;
-  ptrdiff_t first_partition_length_in_bytes = 0;
-  int i, corrupt_tokens = 0;
+static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
+  if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 ||
+      vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 ||
+      vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) {
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Invalid frame sync code");
+  }
+}
 
-  // printf("Decoding frame %d\n", pc->current_video_frame);
+static void error_handler(void *data, int bit_offset) {
+  VP9_COMMON *const cm = (VP9_COMMON *)data;
+  vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
 
-  xd->corrupted = 0;  // start with no corruption of current frame
-  pc->yv12_fb[pc->new_fb_idx].corrupted = 0;
+static void setup_inter_inter(VP9_COMMON *cm) {
+  int i;
 
-  if (data_end - data < 3) {
-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
-  } else {
-    int scaling_active;
-    pc->last_frame_type = pc->frame_type;
-    pc->frame_type = (FRAME_TYPE)(data[0] & 1);
-    pc->version = (data[0] >> 1) & 7;
-    pc->show_frame = (data[0] >> 4) & 1;
-    scaling_active = (data[0] >> 5) & 1;
-    first_partition_length_in_bytes = read_le16(data + 1);
+  cm->allow_comp_inter_inter = 0;
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    cm->allow_comp_inter_inter |= i > 0 &&
+        cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1];
+  }
 
-    if (!read_is_valid(data, first_partition_length_in_bytes, data_end))
-      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
-                         "Truncated packet or corrupt partition 0 length");
+  if (cm->allow_comp_inter_inter) {
+    // which one is always-on in comp inter-inter?
+    if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+        cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+      cm->comp_fixed_ref = ALTREF_FRAME;
+      cm->comp_var_ref[0] = LAST_FRAME;
+      cm->comp_var_ref[1] = GOLDEN_FRAME;
+    } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+               cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+      cm->comp_fixed_ref = GOLDEN_FRAME;
+      cm->comp_var_ref[0] = LAST_FRAME;
+      cm->comp_var_ref[1] = ALTREF_FRAME;
+    } else {
+      cm->comp_fixed_ref = LAST_FRAME;
+      cm->comp_var_ref[0] = GOLDEN_FRAME;
+      cm->comp_var_ref[1] = ALTREF_FRAME;
+    }
+  }
+}
 
-    data += 3;
+#define RESERVED \
+  if (vp9_rb_read_bit(rb)) \
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \
+                         "Reserved bit must be unset")
 
-    vp9_setup_version(pc);
+static size_t read_uncompressed_header(VP9D_COMP *pbi,
+                                       struct vp9_read_bit_buffer *rb) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  int i;
 
-    if (pc->frame_type == KEY_FRAME) {
-      // When error concealment is enabled we should only check the sync
-      // code if we have enough bits available
-      if (data + 3 < data_end) {
-        if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
-          vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
-                             "Invalid frame sync code");
-      }
-      data += 3;
-    }
+  cm->last_frame_type = cm->frame_type;
 
-    data = setup_frame_size(pbi, scaling_active, data, data_end);
-  }
+  if (vp9_rb_read_literal(rb, 2) != 0x2)
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Invalid frame marker");
 
-  if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
-      pc->width == 0 || pc->height == 0) {
-    return -1;
+  cm->version = vp9_rb_read_bit(rb);
+  RESERVED;
+
+  if (vp9_rb_read_bit(rb)) {
+    // show an existing frame directly
+    int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
+    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show);
+    pbi->refresh_frame_flags = 0;
+    cm->filter_level = 0;
+    return 0;
   }
 
-  init_frame(pbi);
+  cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb);
+  cm->show_frame = vp9_rb_read_bit(rb);
+  cm->error_resilient_mode = vp9_rb_read_bit(rb);
 
-  // Reset the frame pointers to the current frame size
-  vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],
-                                pc->width, pc->height,
-                                VP9BORDERINPIXELS);
+  if (cm->frame_type == KEY_FRAME) {
+    int csp;
 
-  if (vp9_start_decode(&header_bc, data,
-                       (unsigned int)first_partition_length_in_bytes))
-    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate bool decoder 0");
+    check_sync_code(cm, rb);
 
-  pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
-  pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
-  pc->error_resilient_mode = vp9_read_bit(&header_bc);
+    csp = vp9_rb_read_literal(rb, 3);  // colorspace
+    if (csp != 7) {  // != sRGB
+      vp9_rb_read_bit(rb);  // [16,235] (including xvycc) vs [0,255] range
+      if (cm->version == 1) {
+        cm->subsampling_x = vp9_rb_read_bit(rb);
+        cm->subsampling_y = vp9_rb_read_bit(rb);
+        vp9_rb_read_bit(rb);  // has extra plane
+      } else {
+        cm->subsampling_y = cm->subsampling_x = 1;
+      }
+    } else {
+      if (cm->version == 1) {
+        cm->subsampling_y = cm->subsampling_x = 0;
+        vp9_rb_read_bit(rb);  // has extra plane
+      } else {
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "RGB not supported in profile 0");
+      }
+    }
 
-  setup_segmentation(pc, xd, &header_bc);
+    pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;
 
-  // Read common prediction model status flag probability updates for the
-  // reference frame
-  if (pc->frame_type == KEY_FRAME) {
-    // Set the prediction probabilities to defaults
-    pc->ref_pred_probs[0] = 120;
-    pc->ref_pred_probs[1] = 80;
-    pc->ref_pred_probs[2] = 40;
-  } else {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (vp9_read_bit(&header_bc))
-        pc->ref_pred_probs[i] = vp9_read_prob(&header_bc);
-    }
-  }
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+      cm->active_ref_idx[i] = cm->new_fb_idx;
 
-  pc->sb64_coded = vp9_read_prob(&header_bc);
-  pc->sb32_coded = vp9_read_prob(&header_bc);
-  xd->lossless = vp9_read_bit(&header_bc);
-  if (xd->lossless) {
-    pc->txfm_mode = ONLY_4X4;
+    setup_frame_size(pbi, rb);
   } else {
-    // Read the loop filter level and type
-    pc->txfm_mode = vp9_read_literal(&header_bc, 2);
-    if (pc->txfm_mode == ALLOW_32X32)
-      pc->txfm_mode += vp9_read_bit(&header_bc);
+    cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
 
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = vp9_read_prob(&header_bc);
-      pc->prob_tx[1] = vp9_read_prob(&header_bc);
-      pc->prob_tx[2] = vp9_read_prob(&header_bc);
-    }
-  }
+    cm->reset_frame_context = cm->error_resilient_mode ?
+        0 : vp9_rb_read_literal(rb, 2);
 
-  setup_loopfilter(pc, xd, &header_bc);
+    if (cm->intra_only) {
+      check_sync_code(cm, rb);
 
-  // Dummy read for now
-  vp9_read_literal(&header_bc, 2);
+      pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
+      setup_frame_size(pbi, rb);
+    } else {
+       pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
 
-  /* Read the default quantizers. */
-  {
-    int q_update = 0;
-    pc->base_qindex = vp9_read_literal(&header_bc, QINDEX_BITS);
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+        const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LG2);
+        cm->active_ref_idx[i] = cm->ref_frame_map[ref];
+        cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
+      }
 
-    /* AC 1st order Q = default */
-    pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);
-    pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);
-    pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);
+      setup_frame_size_with_refs(pbi, rb);
 
-    if (q_update)
-      vp9_init_de_quantizer(pbi);
+      xd->allow_high_precision_mv = vp9_rb_read_bit(rb);
+      cm->mcomp_filter_type = read_interp_filter_type(rb);
 
-    /* MB level dequantizer setup */
-    mb_init_dequantizer(pbi, &pbi->mb);
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+        vp9_setup_scale_factors(cm, i);
+
+      setup_inter_inter(cm);
+    }
   }
 
-  // Determine if the golden frame or ARF buffer should be updated and how.
-  // For all non key frames the GF and ARF refresh flags and sign bias
-  // flags must be set explicitly.
-  if (pc->frame_type == KEY_FRAME) {
-    pc->active_ref_idx[0] = pc->new_fb_idx;
-    pc->active_ref_idx[1] = pc->new_fb_idx;
-    pc->active_ref_idx[2] = pc->new_fb_idx;
+  if (!cm->error_resilient_mode) {
+    cm->refresh_frame_context = vp9_rb_read_bit(rb);
+    cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
   } else {
-    // Should the GF or ARF be updated from the current frame
-    pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES);
+    cm->refresh_frame_context = 0;
+    cm->frame_parallel_decoding_mode = 1;
+  }
 
-    // Select active reference frames
-    for (i = 0; i < 3; i++) {
-      int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
-      pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num];
-    }
+  cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LG2);
 
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+  if ((cm->frame_type == KEY_FRAME) ||
+      cm->error_resilient_mode || cm->intra_only)
+    vp9_setup_past_independence(cm, xd);
 
-    // Is high precision mv allowed
-    xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
+  setup_loopfilter(pbi, rb);
+  setup_quantization(pbi, rb);
+  setup_segmentation(pbi, rb);
 
-    // Read the type of subpel filter to use
-    pc->mcomp_filter_type = vp9_read_bit(&header_bc)
-                                ? SWITCHABLE
-                                : vp9_read_literal(&header_bc, 2);
+  setup_tile_info(cm, rb);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    pc->use_interintra = vp9_read_bit(&header_bc);
-#endif
-    // To enable choice of different interploation filters
-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
-  }
+  return vp9_rb_read_literal(rb, 16);
+}
 
-  if (!pc->error_resilient_mode) {
-    pc->refresh_entropy_probs = vp9_read_bit(&header_bc);
-    pc->frame_parallel_decoding_mode = vp9_read_bit(&header_bc);
-  } else {
-    pc->refresh_entropy_probs = 0;
-    pc->frame_parallel_decoding_mode = 1;
-  }
-  pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);
-  vpx_memcpy(&pc->fc, &pc->frame_contexts[pc->frame_context_idx],
-             sizeof(pc->fc));
+int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
+  int i;
+  vp9_reader header_bc, residual_bc;
+  VP9_COMMON *const pc = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
 
-  // Read inter mode probability context updates
-  if (pc->frame_type != KEY_FRAME) {
-    int i, j;
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      for (j = 0; j < 4; j++) {
-        if (vp9_read(&header_bc, 252)) {
-          pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);
-        }
-      }
-    }
+  const uint8_t *data = pbi->source;
+  const uint8_t *data_end = pbi->source + pbi->source_sz;
+
+  struct vp9_read_bit_buffer rb = { data, data_end, 0,
+                                    pc, error_handler };
+  const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
+  const int keyframe = pc->frame_type == KEY_FRAME;
+  YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
+
+  if (!first_partition_size) {
+    // showing a frame directly
+    *p_data_end = data + 1;
+    return 0;
   }
-#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS
-  if (pc->frame_type == KEY_FRAME)
-    vp9_adjust_default_coef_probs(pc);
-#endif
+  data += vp9_rb_bytes_read(&rb);
+  xd->corrupted = 0;
+  new_fb->corrupted = 0;
 
-#if CONFIG_NEW_MVREF
-  // If Key frame reset mv ref id probabilities to defaults
-  if (pc->frame_type != KEY_FRAME) {
-    // Read any mv_ref index probability updates
-    int i, j;
+  if (!pbi->decoded_key_frame && !keyframe)
+    return -1;
 
-    for (i = 0; i < MAX_REF_FRAMES; ++i) {
-      // Skip the dummy entry for intra ref frame.
-      if (i == INTRA_FRAME) {
-        continue;
-      }
+  if (!read_is_valid(data, first_partition_size, data_end))
+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt header length");
 
-      // Read any updates to probabilities
-      for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
-        if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {
-          xd->mb_mv_ref_probs[i][j] = vp9_read_prob(&header_bc);
-        }
-      }
-    }
-  }
-#endif
+  xd->mode_info_context = pc->mi;
+  xd->prev_mode_info_context = pc->prev_mi;
+  xd->frame_type = pc->frame_type;
+  xd->mode_info_stride = pc->mode_info_stride;
 
-  if (0) {
-    FILE *z = fopen("decodestats.stt", "a");
-    fprintf(z, "%6d F:%d,R:%d,Q:%d\n",
-            pc->current_video_frame,
-            pc->frame_type,
-            pbi->refresh_frame_flags,
-            pc->base_qindex);
-    fclose(z);
-  }
+  if (vp9_reader_init(&header_bc, data, first_partition_size))
+    vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate bool decoder 0");
 
-  update_frame_context(pbi, &header_bc);
+  mb_init_dequantizer(pc, &pbi->mb);  // MB level dequantizer setup
 
+  if (!keyframe)
+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+
+  pc->fc = pc->frame_contexts[pc->frame_context_idx];
+
+  update_frame_context(&pc->fc);
+
+  setup_txfm_mode(pc, xd->lossless, &header_bc);
+
+  read_coef_probs(pbi, &header_bc);
+
   // Initialize xd pointers. Any reference should do for xd->pre, so use 0.
-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],
-             sizeof(YV12_BUFFER_CONFIG));
-  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx],
-             sizeof(YV12_BUFFER_CONFIG));
+  setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL,
+                   0, 0, NULL, NULL);
+  setup_dst_planes(xd, new_fb, 0, 0);
 
   // Create the segmentation map structure and set to 0
   if (!pc->last_frame_seg_map)
     CHECK_MEM_ERROR(pc->last_frame_seg_map,
-                    vpx_calloc((pc->mb_rows * pc->mb_cols), 1));
+                    vpx_calloc((pc->mi_rows * pc->mi_cols), 1));
 
-  /* set up frame new frame for intra coded blocks */
-  vp9_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+  vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
 
-  vp9_setup_block_dptrs(xd);
+  // clear out the coeff buffer
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_zero(xd->plane[i].qcoeff);
 
-  vp9_build_block_doffsets(xd);
+  set_prev_mi(pc);
 
-  /* clear out the coeff buffer */
-  vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
-  /* Read the mb_no_coeff_skip flag */
-  pc->mb_no_coeff_skip = (int)vp9_read_bit(&header_bc);
-
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
-  decode_tiles(pbi, data, first_partition_length_in_bytes,
-               &header_bc, &residual_bc);
-  corrupt_tokens |= xd->corrupted;
+  decode_tiles(pbi, data, first_partition_size, &residual_bc);
 
-  // keep track of the last coded dimensions
   pc->last_width = pc->width;
   pc->last_height = pc->height;
 
-  // Collect information about decoder corruption.
-  // 1. Check first boolean decoder for errors.
-  // 2. Check the macroblock information
-  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) |
-                                          corrupt_tokens;
+  new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted;
 
   if (!pbi->decoded_key_frame) {
-    if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted)
+    if (keyframe && !new_fb->corrupted)
       pbi->decoded_key_frame = 1;
     else
-      vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                          "A stream must start with a complete key frame");
   }
 
+  // Adaptation
   if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(pc);
-#if CONFIG_CODE_NONZEROCOUNT
-    vp9_adapt_nzc_probs(pc);
-#endif
-  }
 
-  if (pc->frame_type != KEY_FRAME) {
-    if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
+    if ((!keyframe) && (!pc->intra_only)) {
       vp9_adapt_mode_probs(pc);
+      vp9_adapt_mode_context(pc);
       vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
-      vp9_adapt_mode_context(&pbi->common);
     }
   }
 
-  if (pc->refresh_entropy_probs) {
-    vpx_memcpy(&pc->frame_contexts[pc->frame_context_idx], &pc->fc,
-               sizeof(pc->fc));
-  }
+  if (pc->refresh_frame_context)
+    pc->frame_contexts[pc->frame_context_idx] = pc->fc;
 
-#ifdef PACKET_TESTING
-  {
-    FILE *f = fopen("decompressor.VP8", "ab");
-    unsigned int size = residual_bc.pos + header_bc.pos + 8;
-    fwrite((void *) &size, 4, 1, f);
-    fwrite((void *) pbi->Source, size, 1, f);
-    fclose(f);
-  }
-#endif
-
-  /* Find the end of the coded buffer */
-  while (residual_bc.count > CHAR_BIT &&
-         residual_bc.count < VP9_BD_VALUE_SIZE) {
-    residual_bc.count -= CHAR_BIT;
-    residual_bc.user_buffer--;
-  }
-  *p_data_end = residual_bc.user_buffer;
+  *p_data_end = vp9_reader_find_end(&residual_bc);
   return 0;
 }
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -12,8 +12,11 @@
 #ifndef VP9_DECODER_VP9_DECODFRAME_H_
 #define VP9_DECODER_VP9_DECODFRAME_H_
 
+struct VP9Common;
 struct VP9Decompressor;
 
-void vp9_init_de_quantizer(struct VP9Decompressor *pbi);
+void vp9_init_dequantizer(struct VP9Common *pc);
+int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
+vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
--- a/vp9/decoder/vp9_dequantize.c
+++ /dev/null
@@ -1,401 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9_rtcd.h"
-#include "vp9/decoder/vp9_dequantize.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/common/vp9_common.h"
-
-
-static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
-                         uint8_t *dest, int stride, int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff[c] + pred[c]);
-
-    dest += stride;
-    diff += width;
-    pred += pitch;
-  }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, const uint8_t *pred, int pitch,
-                         uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 4, 4);
-}
-
-void vp9_add_residual_8x8_c(const int16_t *diff, const uint8_t *pred, int pitch,
-                         uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 8, 8);
-}
-
-void vp9_add_residual_16x16_c(const int16_t *diff, const uint8_t *pred,
-                              int pitch, uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 16, 16);
-}
-
-void vp9_add_residual_32x32_c(const int16_t *diff, const uint8_t *pred,
-                              int pitch, uint8_t *dest, int stride) {
-  add_residual(diff, pred, pitch, dest, stride, 32, 32);
-}
-
-static void add_constant_residual(const int16_t diff, const uint8_t *pred,
-                                  int pitch, uint8_t *dest, int stride,
-                                  int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff + pred[c]);
-
-    dest += stride;
-    pred += pitch;
-  }
-}
-
-void vp9_add_constant_residual_8x8_c(const int16_t diff, const uint8_t *pred,
-                                     int pitch, uint8_t *dest, int stride) {
-  add_constant_residual(diff, pred, pitch, dest, stride, 8, 8);
-}
-
-void vp9_add_constant_residual_16x16_c(const int16_t diff, const uint8_t *pred,
-                                       int pitch, uint8_t *dest, int stride) {
-  add_constant_residual(diff, pred, pitch, dest, stride, 16, 16);
-}
-
-void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred,
-                                       int pitch, uint8_t *dest, int stride) {
-  add_constant_residual(diff, pred, pitch, dest, stride, 32, 32);
-}
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
-                               const int16_t *dq,
-                               uint8_t *pred, uint8_t *dest,
-                               int pitch, int stride, int eob) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  for (i = 0; i < 16; i++)
-    input[i] *= dq[i];
-
-  vp9_short_iht4x4(input, output, 4, tx_type);
-  vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-}
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq,
-                                   uint8_t *pred, uint8_t *dest,
-                                   int pitch, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
-  if (eob == 0) {
-    // All 0 DCT coefficients
-    vp9_copy_mem8x8(pred, pitch, dest, stride);
-  } else if (eob > 0) {
-    int i;
-
-    input[0] *= dq[0];
-    for (i = 1; i < 64; i++)
-      input[i] *= dq[1];
-
-    vp9_short_iht8x8(input, output, 8, tx_type);
-    vpx_memset(input, 0, 128);
-    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
-                            uint8_t *dest, int pitch, int stride, int eob) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  if (eob > 1) {
-    for (i = 0; i < 16; i++)
-      input[i] *= dq[i];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4(input, output, 4 << 1);
-
-    vpx_memset(input, 0, 32);
-
-    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-  } else {
-    vp9_dc_only_idct_add(input[0]*dq[0], pred, dest, pitch, stride);
-    ((int *)input)[0] = 0;
-  }
-}
-
-void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
-                               uint8_t *dest, int pitch, int stride, int dc) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  input[0] = dc;
-
-  for (i = 1; i < 16; i++)
-    input[i] *= dq[i];
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4(input, output, 4 << 1);
-  vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-}
-
-void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                     uint8_t *pred, uint8_t *dest,
-                                     int pitch, int stride, int eob) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  if (eob > 1) {
-    for (i = 0; i < 16; i++)
-      input[i] *= dq[i];
-
-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
-
-    vpx_memset(input, 0, 32);
-
-    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-  } else {
-    vp9_dc_only_inv_walsh_add(input[0]*dq[0], pred, dest, pitch, stride);
-    ((int *)input)[0] = 0;
-  }
-}
-
-void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                        uint8_t *pred,
-                                        uint8_t *dest,
-                                        int pitch, int stride, int dc) {
-  int i;
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-  input[0] = dc;
-
-  for (i = 1; i < 16; i++)
-    input[i] *= dq[i];
-
-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
-  vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
-}
-
-void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
-                                uint8_t *pred, uint8_t *dest, int pitch,
-                                int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
-  // If dc is 1, then input[0] is the reconstructed value, do not need
-  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
-  input[0] *= dq[0];
-
-  // The calculation can be simplified if there are not many non-zero dct
-  // coefficients. Use eobs to decide what to do.
-  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
-  // Combine that with code here.
-  if (eob == 0) {
-    // All 0 DCT coefficients
-    vp9_copy_mem8x8(pred, pitch, dest, stride);
-  } else if (eob == 1) {
-    // DC only DCT coefficient
-    int16_t in = input[0];
-    int16_t out;
-
-     // Note: the idct1 will need to be modified accordingly whenever
-     // vp9_short_idct8x8_c() is modified.
-    vp9_short_idct1_8x8_c(&in, &out);
-    input[0] = 0;
-
-    vp9_add_constant_residual_8x8(out, pred, pitch, dest, stride);
-#if !CONFIG_SCATTERSCAN
-  } else if (eob <= 10) {
-    input[1] *= dq[1];
-    input[2] *= dq[1];
-    input[3] *= dq[1];
-    input[8] *= dq[1];
-    input[9] *= dq[1];
-    input[10] *= dq[1];
-    input[16] *= dq[1];
-    input[17] *= dq[1];
-    input[24] *= dq[1];
-
-    vp9_short_idct10_8x8(input, output, 16);
-
-    input[0] = input[1] = input[2] = input[3] = 0;
-    input[8] = input[9] = input[10] = 0;
-    input[16] = input[17] = 0;
-    input[24] = 0;
-
-    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
-#endif
-  } else {
-    int i;
-
-    // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 64; i++)
-      input[i] *= dq[1];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct8x8(input, output, 8 << 1);
-    vpx_memset(input, 0, 128);
-    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, uint8_t *pred,
-                                     uint8_t *dest, int pitch, int stride,
-                                     int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
-  if (eob == 0) {
-    // All 0 DCT coefficients
-    vp9_copy_mem16x16(pred, pitch, dest, stride);
-  } else if (eob > 0) {
-    int i;
-
-    input[0] *= dq[0];
-
-    // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 256; i++)
-      input[i] *= dq[1];
-
-    // inverse hybrid transform
-    vp9_short_iht16x16(input, output, 16, tx_type);
-
-    // the idct halves ( >> 1) the pitch
-    // vp9_short_idct16x16(input, output, 32);
-
-    vpx_memset(input, 0, 512);
-
-    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
-                                  uint8_t *pred, uint8_t *dest, int pitch,
-                                  int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
-  /* The calculation can be simplified if there are not many non-zero dct
-   * coefficients. Use eobs to separate different cases. */
-  if (eob == 0) {
-    /* All 0 DCT coefficient */
-    vp9_copy_mem16x16(pred, pitch, dest, stride);
-  } else if (eob == 1) {
-    /* DC only DCT coefficient. */
-    int16_t in = input[0] * dq[0];
-    int16_t out;
-    /* Note: the idct1 will need to be modified accordingly whenever
-     * vp9_short_idct16x16() is modified. */
-    vp9_short_idct1_16x16_c(&in, &out);
-    input[0] = 0;
-
-    vp9_add_constant_residual_16x16(out, pred, pitch, dest, stride);
-#if !CONFIG_SCATTERSCAN
-  } else if (eob <= 10) {
-    input[0] *= dq[0];
-
-    input[1] *= dq[1];
-    input[2] *= dq[1];
-    input[3] *= dq[1];
-    input[16] *= dq[1];
-    input[17] *= dq[1];
-    input[18] *= dq[1];
-    input[32] *= dq[1];
-    input[33] *= dq[1];
-    input[48] *= dq[1];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct10_16x16(input, output, 32);
-
-    input[0] = input[1] = input[2] = input[3] = 0;
-    input[16] = input[17] = input[18] = 0;
-    input[32] = input[33] = 0;
-    input[48] = 0;
-
-    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
-#endif
-  } else {
-    int i;
-
-    input[0] *= dq[0];
-
-    // recover quantizer for 4 4x4 blocks
-    for (i = 1; i < 256; i++)
-      input[i] *= dq[1];
-
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct16x16(input, output, 16 << 1);
-
-    vpx_memset(input, 0, 512);
-
-    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
-  }
-}
-
-void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
-                                  uint8_t *pred, uint8_t *dest, int pitch,
-                                  int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
-
-  if (eob) {
-    input[0] = input[0] * dq[0] / 2;
-    if (eob == 1) {
-      vp9_short_idct1_32x32(input, output);
-      vp9_add_constant_residual_32x32(output[0], pred, pitch, dest, stride);
-      input[0] = 0;
-#if !CONFIG_SCATTERSCAN
-    } else if (eob <= 10) {
-      input[1] = input[1] * dq[1] / 2;
-      input[2] = input[2] * dq[1] / 2;
-      input[3] = input[3] * dq[1] / 2;
-      input[32] = input[32] * dq[1] / 2;
-      input[33] = input[33] * dq[1] / 2;
-      input[34] = input[34] * dq[1] / 2;
-      input[64] = input[64] * dq[1] / 2;
-      input[65] = input[65] * dq[1] / 2;
-      input[96] = input[96] * dq[1] / 2;
-
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_32x32(input, output, 64);
-
-      input[0] = input[1] = input[2] = input[3] = 0;
-      input[32] = input[33] = input[34] = 0;
-      input[64] = input[65] = 0;
-      input[96] = 0;
-
-      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
-#endif
-    } else {
-      int i;
-      for (i = 1; i < 1024; i++)
-        input[i] = input[i] * dq[1] / 2;
-      vp9_short_idct32x32(input, output, 64);
-      vpx_memset(input, 0, 2048);
-      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
-    }
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq,
-                                           uint8_t *dstu,
-                                           uint8_t *dstv,
-                                           int stride,
-                                           MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride,
-                               xd->eobs[64]);
-  vp9_dequant_idct_add_16x16_c(q + 256, dq, dstv, dstv, stride, stride,
-                               xd->eobs[80]);
-}
--- a/vp9/decoder/vp9_dequantize.h
+++ /dev/null
@@ -1,96 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_DEQUANTIZE_H_
-#define VP9_DECODER_VP9_DEQUANTIZE_H_
-
-#include "vp9/common/vp9_blockd.h"
-
-
-void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                     unsigned char *pred,
-                                     unsigned char *output,
-                                     int pitch, int stride, int eob);
-
-void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
-                                        unsigned char *pred,
-                                        unsigned char *output,
-                                        int pitch, int stride, int dc);
-
-void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
-                                                const int16_t *dq,
-                                                unsigned char *pre,
-                                                unsigned char *dst,
-                                                int stride,
-                                                const int16_t *dc);
-
-void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
-                                             unsigned char *pre,
-                                             unsigned char *dst,
-                                             int stride,
-                                             struct macroblockd *xd);
-
-void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
-                                              unsigned char *pre,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
-                                              int stride,
-                                              struct macroblockd *xd);
-
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
-                                    unsigned char *pred, unsigned char *dest,
-                                    int pitch, int stride, int eob);
-
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
-                                   const int16_t *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int eob);
-
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
-                                     const int16_t *dq, unsigned char *pred,
-                                     unsigned char *dest,
-                                     int pitch, int stride, int eob);
-
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                   unsigned char *dst,
-                                                   int stride,
-                                                   const int16_t *dc,
-                                                   MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                unsigned char *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd);
-
-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                   unsigned char *dst,
-                                                   int stride,
-                                                   const int16_t *dc,
-                                                   MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                unsigned char *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd);
-
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd);
-
-#endif  // VP9_DECODER_VP9_DEQUANTIZE_H_
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -10,6 +10,7 @@
 
 
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -16,8 +17,13 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#if CONFIG_BALANCED_COEFTREE
+#define ZERO_CONTEXT_NODE           0
+#define EOB_CONTEXT_NODE            1
+#else
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
+#endif
 #define ONE_CONTEXT_NODE            2
 #define LOW_VAL_CONTEXT_NODE        3
 #define TWO_CONTEXT_NODE            4
@@ -57,236 +63,185 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
-static int16_t get_signed(BOOL_DECODER *br, int16_t value_to_sign) {
-  return decode_bool(br, 128) ? -value_to_sign : value_to_sign;
-}
-
-
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
-               [pt][token]++;     \
-    token_cache[c] = token; \
-    pt = vp9_get_coef_context(scan, nb, pad, token_cache,     \
-                              c + 1, default_eob); \
+    coef_counts[type][ref][band][pt]         \
+               [token >= TWO_TOKEN ?     \
+                (token == DCT_EOB_TOKEN ? DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
+                token]++;     \
+    token_cache[scan[c]] = vp9_pt_energy_class[token]; \
   } while (0)
 
-#if CONFIG_CODE_NONZEROCOUNT
-#define WRITE_COEF_CONTINUE(val, token)                       \
-  {                                                           \
-    qcoeff_ptr[scan[c]] = get_signed(br, val);                \
-    INCREMENT_COUNT(token);                                   \
-    c++;                                                      \
-    nzc++;                                                    \
-    continue;                                                 \
-  }
-#else
 #define WRITE_COEF_CONTINUE(val, token)                  \
   {                                                      \
-    qcoeff_ptr[scan[c]] = get_signed(br, val);           \
+    qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
+                            dq[c > 0] / (1 + (txfm_size == TX_32X32)); \
     INCREMENT_COUNT(token);                              \
     c++;                                                 \
     continue;                                            \
   }
-#endif  // CONFIG_CODE_NONZEROCOUNT
 
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
-    if (vp9_read(br, prob))            \
+    if (vp9_read(r, prob))             \
       val += 1 << bits_count;          \
   } while (0);
 
 static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
-                        BOOL_DECODER* const br, int block_idx,
+                        vp9_reader *r, int block_idx,
                         PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
-                        TX_SIZE txfm_size) {
-  ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context;
-  ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context;
-  int aidx, lidx;
+                        TX_SIZE txfm_size, const int16_t *dq,
+                        ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
   ENTROPY_CONTEXT above_ec, left_ec;
   FRAME_CONTEXT *const fc = &dx->common.fc;
   int pt, c = 0, pad, default_eob;
-  vp9_coeff_probs *coef_probs;
+  int band;
+  vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES];
+  vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = {
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+    {0, 0, 0, 0, 0, 0},
+  };
+
   vp9_prob *prob;
-  vp9_coeff_count *coef_counts;
-  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;
-#if CONFIG_CODE_NONZEROCOUNT
-  uint16_t nzc = 0;
-  uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx];
-#endif
+  vp9_coeff_count_model *coef_counts;
+  const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME;
+  TX_TYPE tx_type = DCT_DCT;
   const int *scan, *nb;
   uint8_t token_cache[1024];
+  const uint8_t * band_translate;
+#if CONFIG_BALANCED_COEFTREE
+  int skip_eob_node = 0;
+#endif
 
-  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-    aidx = vp9_block2above_sb64[txfm_size][block_idx];
-    lidx = vp9_block2left_sb64[txfm_size][block_idx];
-  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
-    aidx = vp9_block2above_sb[txfm_size][block_idx];
-    lidx = vp9_block2left_sb[txfm_size][block_idx];
-  } else {
-    aidx = vp9_block2above[txfm_size][block_idx];
-    lidx = vp9_block2left[txfm_size][block_idx];
-  }
-
+  coef_probs  = fc->coef_probs[txfm_size][type][ref];
+  coef_counts = fc->coef_counts[txfm_size];
   switch (txfm_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, block_idx) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_4x4;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_4x4;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_4x4;
-          break;
-      }
-      above_ec = A0[aidx] != 0;
-      left_ec = L0[lidx] != 0;
-      coef_probs  = fc->coef_probs_4x4;
-      coef_counts = fc->coef_counts_4x4;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block_idx) : DCT_DCT;
+      scan = get_scan_4x4(tx_type);
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
       default_eob = 16;
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 3 + sb_type, x = block_idx & ((1 << sz) - 1);
+      const int sz = 1 + b_width_log2(sb_type);
+      const int x = block_idx & ((1 << sz) - 1);
       const int y = block_idx - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_8x8;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_8x8;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_8x8;
-          break;
-      }
-      coef_probs  = fc->coef_probs_8x8;
-      coef_counts = fc->coef_counts_8x8;
-      above_ec = (A0[aidx] + A0[aidx + 1]) != 0;
-      left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      scan = get_scan_8x8(tx_type);
+      above_ec = (A[0] + A[1]) != 0;
+      left_ec = (L[0] + L[1]) != 0;
       default_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 4 + sb_type, x = block_idx & ((1 << sz) - 1);
+      const int sz = 2 + b_width_log2(sb_type);
+      const int x = block_idx & ((1 << sz) - 1);
       const int y = block_idx - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      switch (tx_type) {
-        default:
-          scan = vp9_default_zig_zag1d_16x16;
-          break;
-        case ADST_DCT:
-          scan = vp9_row_scan_16x16;
-          break;
-        case DCT_ADST:
-          scan = vp9_col_scan_16x16;
-          break;
-      }
-      coef_probs  = fc->coef_probs_16x16;
-      coef_counts = fc->coef_counts_16x16;
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1]) != 0;
-      } else {
-        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0;
-      }
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
+      above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
       default_eob = 256;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      scan = vp9_default_zig_zag1d_32x32;
-      coef_probs = fc->coef_probs_32x32;
-      coef_counts = fc->coef_counts_32x32;
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
-        ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
-        ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);
-        ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);
-        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1] +
-                    A2[aidx] + A2[aidx + 1] + A3[aidx] + A3[aidx + 1]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1] +
-                    L2[lidx] + L2[lidx + 1] + L3[lidx] + L3[lidx + 1]) != 0;
-      } else {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3] +
-                    A1[aidx] + A1[aidx + 1] + A1[aidx + 2] + A1[aidx + 3]) != 0;
-        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3] +
-                    L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0;
-      }
+      scan = vp9_default_scan_32x32;
+      above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
       default_eob = 1024;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, above_ec, left_ec);
+  pt = combine_entropy_contexts(above_ec, left_ec);
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   while (1) {
     int val;
     const uint8_t *cat6 = cat6_prob;
-
     if (c >= seg_eob)
       break;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc == nzc_expected)
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache,
+                                c, default_eob);
+    band = get_coef_band(band_translate, c);
+    prob = coef_probs[band][pt];
+#if !CONFIG_BALANCED_COEFTREE
+    fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
+    if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
-#endif
-    prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];
-#if CONFIG_CODE_NONZEROCOUNT == 0
-    fc->eob_branch_counts[txfm_size][type][ref]
-                         [get_coef_band(scan, txfm_size, c)][pt]++;
-    if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
-      break;
-#endif
+
 SKIP_START:
+#endif
     if (c >= seg_eob)
       break;
-#if CONFIG_CODE_NONZEROCOUNT
-    if (nzc == nzc_expected)
-      break;
-    // decode zero node only if there are zeros left
-    if (seg_eob - nzc_expected - c + nzc > 0)
-#endif
-    if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache,
+                                c, default_eob);
+    band = get_coef_band(band_translate, c);
+    prob = coef_probs[band][pt];
+
+    if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
-      prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];
+#if CONFIG_BALANCED_COEFTREE
+      skip_eob_node = 1;
+      continue;
+#else
       goto SKIP_START;
+#endif
     }
+#if CONFIG_BALANCED_COEFTREE
+    if (!skip_eob_node) {
+      fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
+      if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
+        break;
+    }
+    skip_eob_node = 0;
+#endif
+
     // ONE_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[ONE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
       WRITE_COEF_CONTINUE(1, ONE_TOKEN);
     }
+    // Load full probabilities if not already loaded
+    if (!load_map[band][pt]) {
+      vp9_model_to_full_probs(coef_probs[band][pt],
+                              coef_probs_full[band][pt]);
+      load_map[band][pt] = 1;
+    }
+    prob = coef_probs_full[band][pt];
     // LOW_VAL_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[LOW_VAL_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[TWO_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
         WRITE_COEF_CONTINUE(2, TWO_TOKEN);
       }
-      if (!vp9_read(br, prob[THREE_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[THREE_CONTEXT_NODE])) {
         WRITE_COEF_CONTINUE(3, THREE_TOKEN);
       }
       WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
     }
     // HIGH_LOW_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_ONE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
         val = CAT1_MIN_VAL;
         ADJUST_COEF(CAT1_PROB0, 0);
         WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY1);
@@ -297,8 +252,8 @@
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2);
     }
     // CAT_THREEFOUR_CONTEXT_NODE_0_
-    if (!vp9_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
-      if (!vp9_read(br, prob[CAT_THREE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
+      if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
         val = CAT3_MIN_VAL;
         ADJUST_COEF(CAT3_PROB2, 2);
         ADJUST_COEF(CAT3_PROB1, 1);
@@ -313,7 +268,7 @@
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4);
     }
     // CAT_FIVE_CONTEXT_NODE_0_:
-    if (!vp9_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {
+    if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
       val = CAT5_MIN_VAL;
       ADJUST_COEF(CAT5_PROB4, 4);
       ADJUST_COEF(CAT5_PROB3, 3);
@@ -324,262 +279,73 @@
     }
     val = 0;
     while (*cat6) {
-      val = (val << 1) | vp9_read(br, *cat6++);
+      val = (val << 1) | vp9_read(r, *cat6++);
     }
     val += CAT6_MIN_VAL;
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
-#if CONFIG_CODE_NONZEROCOUNT == 0
   if (c < seg_eob)
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
-               [pt][DCT_EOB_TOKEN]++;
-#endif
+    coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++;
 
-  A0[aidx] = L0[lidx] = c > 0;
-  if (txfm_size >= TX_8X8) {
-    A0[aidx + 1] = L0[lidx + 1] = A0[aidx];
-    if (txfm_size >= TX_16X16) {
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-        A1[aidx] = A1[aidx + 1] = L1[lidx] = L1[lidx + 1] = A0[aidx];
-        if (txfm_size >= TX_32X32) {
-          ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);
-          ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);
-          ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);
-          ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);
-          A2[aidx] = A2[aidx + 1] = A3[aidx] = A3[aidx + 1] = A0[aidx];
-          L2[lidx] = L2[lidx + 1] = L3[lidx] = L3[lidx + 1] = A0[aidx];
-        }
-      } else {
-        A0[aidx + 2] = A0[aidx + 3] = L0[lidx + 2] = L0[lidx + 3] = A0[aidx];
-        if (txfm_size >= TX_32X32) {
-          ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);
-          ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);
-          A1[aidx] = A1[aidx + 1] = A1[aidx + 2] = A1[aidx + 3] = A0[aidx];
-          L1[lidx] = L1[lidx + 1] = L1[lidx + 2] = L1[lidx + 3] = A0[aidx];
-        }
-      }
-    }
-  }
+
   return c;
 }
 
 static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
-  return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+  return vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
-static INLINE int decode_sb(VP9D_COMP* const pbi,
-                            MACROBLOCKD* const xd,
-                            BOOL_DECODER* const bc,
-                            int offset, int count, int inc,
-                            int eob_max, TX_SIZE tx_size) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, eob_max);
-  int i, eobtotal = 0;
+struct decode_block_args {
+  VP9D_COMP *pbi;
+  MACROBLOCKD *xd;
+  vp9_reader *r;
+  int *eobtotal;
+};
+static void decode_block(int plane, int block,
+                         BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size,
+                         void *argv) {
+  const struct decode_block_args* const arg = argv;
+  const int bw = b_width_log2(bsize);
 
-  // luma blocks
-  for (i = 0; i < offset; i += inc) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, seg_eob,
-                               xd->qcoeff + i * 16, tx_size);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
+  // find the maximum eob for this transform size, adjusted by segment
+  MACROBLOCKD *xd = arg->xd;
+  const int segment_id = arg->xd->mode_info_context->mbmi.segment_id;
+  const TX_SIZE ss_tx_size = ss_txfrm_size / 2;
+  const int seg_eob = get_eob(arg->xd, segment_id, 16 << ss_txfrm_size);
+  int16_t* const qcoeff_base = arg->xd->plane[plane].qcoeff;
+  const int off = block >> ss_txfrm_size;
+  const int mod = bw - ss_tx_size - arg->xd->plane[plane].subsampling_x;
+  const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size;
+  const int loff = (off >> mod) << ss_tx_size;
+  int pt;
+  ENTROPY_CONTEXT *A = arg->xd->plane[plane].above_context + aoff;
+  ENTROPY_CONTEXT *L = arg->xd->plane[plane].left_context + loff;
+  const int eob = decode_coefs(arg->pbi, arg->xd, arg->r, block,
+                               arg->xd->plane[plane].plane_type, seg_eob,
+                               BLOCK_OFFSET(qcoeff_base, block, 16),
+                               ss_tx_size, arg->xd->plane[plane].dequant,
+                               A,
+                               L);
 
-  // chroma blocks
-  for (i = offset; i < count; i += inc) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                               xd->qcoeff + i * 16, tx_size);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
-
-  return eobtotal;
-}
-
-int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
-                         MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc) {
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32: {
-      // 32x32 luma block
-      const int segment_id = xd->mode_info_context->mbmi.segment_id;
-      int i, eobtotal = 0, seg_eob;
-      int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       get_eob(xd, segment_id, 1024), xd->qcoeff, TX_32X32);
-      xd->eobs[0] = c;
-      eobtotal += c;
-
-      // 16x16 chroma blocks
-      seg_eob = get_eob(xd, segment_id, 256);
-      for (i = 64; i < 96; i += 16) {
-        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,
-                         xd->qcoeff + i * 16, TX_16X16);
-        xd->eobs[i] = c;
-        eobtotal += c;
-      }
-      return eobtotal;
-    }
-    case TX_16X16:
-      return decode_sb(pbi, xd, bc, 64, 96, 16, 16 * 16, TX_16X16);
-    case TX_8X8:
-      return decode_sb(pbi, xd, bc, 64, 96, 4, 8 * 8, TX_8X8);
-    case TX_4X4:
-      return decode_sb(pbi, xd, bc, 64, 96, 1, 4 * 4, TX_4X4);
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
-int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
-                           MACROBLOCKD* const xd,
-                           BOOL_DECODER* const bc) {
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      return decode_sb(pbi, xd, bc, 256, 384, 64, 32 * 32, TX_32X32);
-    case TX_16X16:
-      return decode_sb(pbi, xd, bc, 256, 384, 16, 16 * 16, TX_16X16);
-    case TX_8X8:
-      return decode_sb(pbi, xd, bc, 256, 384, 4, 8 * 8, TX_8X8);
-    case TX_4X4:
-      return decode_sb(pbi, xd, bc, 256, 384, 1, 4 * 4, TX_4X4);
-    default:
-      assert(0);
-      return 0;
-  }
-}
-
-static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
-                                      MACROBLOCKD* const xd,
-                                      BOOL_DECODER* const bc) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int i, eobtotal = 0, seg_eob;
-
-  // Luma block
-  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,
-                       get_eob(xd, segment_id, 256), xd->qcoeff, TX_16X16);
-  xd->eobs[0] = c;
-  eobtotal += c;
-
-  // 8x8 chroma blocks
-  seg_eob = get_eob(xd, segment_id, 64);
-  for (i = 16; i < 24; i += 4) {
-    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                     seg_eob, xd->block[i].qcoeff, TX_8X8);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
-  return eobtotal;
-}
-
-static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
-                                    MACROBLOCKD* const xd,
-                                    BOOL_DECODER* const bc) {
-  int i, eobtotal = 0;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  // luma blocks
-  int seg_eob = get_eob(xd, segment_id, 64);
-  for (i = 0; i < 16; i += 4) {
-    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,
-                               seg_eob, xd->block[i].qcoeff, TX_8X8);
-    xd->eobs[i] = c;
-    eobtotal += c;
-  }
-
-  // chroma blocks
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-      xd->mode_info_context->mbmi.mode == SPLITMV) {
-    // use 4x4 transform for U, V components in I8X8/splitmv prediction mode
-    seg_eob = get_eob(xd, segment_id, 16);
-    for (i = 16; i < 24; i++) {
-      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                                 seg_eob, xd->block[i].qcoeff, TX_4X4);
-      xd->eobs[i] = c;
-      eobtotal += c;
-    }
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    set_contexts_on_border(xd, bsize, plane, ss_tx_size, eob, aoff, loff, A, L);
   } else {
-    for (i = 16; i < 24; i += 4) {
-      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,
-                                 seg_eob, xd->block[i].qcoeff, TX_8X8);
-      xd->eobs[i] = c;
-      eobtotal += c;
+    for (pt = 0; pt < (1 << ss_tx_size); pt++) {
+      A[pt] = L[pt] = eob > 0;
     }
   }
-
-  return eobtotal;
+  arg->xd->plane[plane].eobs[block] = eob;
+  arg->eobtotal[0] += eob;
 }
 
-static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                            BOOL_DECODER* const bc,
-                            PLANE_TYPE type, int i, int seg_eob) {
-  const int c = decode_coefs(dx, xd, bc, i, type, seg_eob,
-                             xd->block[i].qcoeff, TX_4X4);
-  xd->eobs[i] = c;
-  return c;
-}
-
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc,
-                         PLANE_TYPE type, int i) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob);
-}
-
-static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                   MACROBLOCKD* const xd,
-                                   BOOL_DECODER* const bc,
-                                   int seg_eob) {
-  int i, eobtotal = 0;
-
-  // chroma blocks
-  for (i = 16; i < 24; i++)
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob);
-
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                MACROBLOCKD* const xd,
-                                BOOL_DECODER* const bc) {
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  return decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
-}
-
-static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
-                                    MACROBLOCKD* const xd,
-                                    BOOL_DECODER* const bc) {
-  int i, eobtotal = 0;
-  const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  const int seg_eob = get_eob(xd, segment_id, 16);
-
-  // luma blocks
-  for (i = 0; i < 16; ++i)
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob);
-
-  // chroma blocks
-  eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
-
-  return eobtotal;
-}
-
-int vp9_decode_mb_tokens(VP9D_COMP* const dx,
+int vp9_decode_tokens(VP9D_COMP* const pbi,
                          MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc) {
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  switch (tx_size) {
-    case TX_16X16:
-      return vp9_decode_mb_tokens_16x16(dx, xd, bc);
-    case TX_8X8:
-      return vp9_decode_mb_tokens_8x8(dx, xd, bc);
-    default:
-      assert(tx_size == TX_4X4);
-      return vp9_decode_mb_tokens_4x4(dx, xd, bc);
-  }
+                         vp9_reader *r,
+                         BLOCK_SIZE_TYPE bsize) {
+  int eobtotal = 0;
+  struct decode_block_args args = {pbi, xd, r, &eobtotal};
+  foreach_transformed_block(xd, bsize, decode_block, &args);
+  return eobtotal;
 }
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -14,22 +14,9 @@
 
 #include "vp9/decoder/vp9_onyxd_int.h"
 
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc,
-                         PLANE_TYPE type, int i);
-
-int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
-                         BOOL_DECODER* const);
-
-int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
-                         MACROBLOCKD* const xd,
-                         BOOL_DECODER* const bc);
-
-int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,
-                           MACROBLOCKD* const xd,
-                           BOOL_DECODER* const bc);
-
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
-                                BOOL_DECODER* const bc);
+int vp9_decode_tokens(VP9D_COMP* const pbi,
+                      MACROBLOCKD* const xd,
+                      vp9_reader *r,
+                      BLOCK_SIZE_TYPE bsize);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -10,18 +10,15 @@
 
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_dequantize.h"
+#include "vp9/decoder/vp9_idct_blk.h"
 
-void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q,
-                                                const int16_t *dq,
-                                                uint8_t *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd) {
+void vp9_idct_add_y_block_c(int16_t *q, uint8_t *dst, int stride,
+                            MACROBLOCKD *xd) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
     for (j = 0; j < 4; j++) {
-      xd->itxm_add(q, dq, dst, dst, stride, stride, xd->eobs[i * 4 + j]);
+      vp9_idct_add(q, dst, stride, xd->plane[0].eobs[i * 4  + j]);
       q   += 16;
       dst += 4;
     }
@@ -30,202 +27,205 @@
   }
 }
 
-void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
-                                    uint8_t *pre,
-                                    uint8_t *dst,
-                                    int stride, MACROBLOCKD *xd) {
+void vp9_idct_add_uv_block_c(int16_t *q, uint8_t *dst, int stride,
+                             uint16_t *eobs) {
   int i, j;
 
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      vp9_dequant_idct_add(q, dq, pre, dst, 16, stride, xd->eobs[i * 4  + j]);
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      vp9_idct_add(q, dst, stride, eobs[i * 2 + j]);
       q   += 16;
-      pre += 4;
       dst += 4;
     }
 
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
+    dst += 4 * stride - 8;
   }
 }
 
-void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
-                                     uint8_t *pre, uint8_t *dstu,
-                                     uint8_t *dstv, int stride,
+void vp9_idct_add_y_block_8x8_c(int16_t *q, uint8_t *dst, int stride,
+                                MACROBLOCKD *xd) {
+  uint8_t *origdest = dst;
+
+  vp9_idct_add_8x8_c(q, dst, stride, xd->plane[0].eobs[0]);
+  vp9_idct_add_8x8_c(&q[64], origdest + 8, stride, xd->plane[0].eobs[4]);
+  vp9_idct_add_8x8_c(&q[128], origdest + 8 * stride, stride,
+                     xd->plane[0].eobs[8]);
+  vp9_idct_add_8x8_c(&q[192], origdest + 8 * stride + 8, stride,
+                     xd->plane[0].eobs[12]);
+}
+
+void vp9_idct_add_y_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
                                      MACROBLOCKD *xd) {
   int i, j;
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add(q, dq, pre, dstu, 8, stride,
-                           xd->eobs[16 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      vp9_idct_add_lossless_c(q, dst, stride, xd->plane[0].eobs[i * 4 + j]);
+      q   += 16;
+      dst += 4;
     }
 
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
+    dst += 4 * stride - 16;
   }
-
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add(q, dq, pre, dstv, 8, stride,
-                           xd->eobs[20 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
-
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
-  }
 }
 
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd) {
+void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
+                                      uint16_t *eobs) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
     for (j = 0; j < 2; j++) {
-      xd->itxm_add(q, dq, dstu, dstu, stride, stride, xd->eobs[16 + i * 2 + j]);
-      q    += 16;
-      dstu += 4;
+      vp9_idct_add_lossless_c(q, dst, stride, eobs[i * 2 + j]);
+      q   += 16;
+      dst += 4;
     }
 
-    dstu += 4 * stride - 8;
+    dst += 4 * stride - 8;
   }
+}
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      xd->itxm_add(q, dq, dstv, dstv, stride, stride, xd->eobs[20 + i * 2 + j]);
-      q    += 16;
-      dstv += 4;
-    }
+static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
+                                  int width, int height) {
+  int r, c;
 
-    dstv += 4 * stride - 8;
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++)
+      dest[c] = clip_pixel(diff + dest[c]);
+
+    dest += stride;
   }
 }
 
-void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q,
-                                                const int16_t *dq,
-                                                uint8_t *dst,
-                                                int stride,
-                                                MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, xd->eobs[0]);
+void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest,
+                                     int stride) {
+  add_constant_residual(diff, dest, stride, 8, 8);
+}
 
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,
-                             dst + 8, stride, stride, xd->eobs[4]);
+void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest,
+                                       int stride) {
+  add_constant_residual(diff, dest, stride, 16, 16);
+}
 
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                             dst + 8 * stride, stride, stride,
-                             xd->eobs[8]);
+void vp9_add_constant_residual_32x32_c(const int16_t diff,  uint8_t *dest,
+                                       int stride) {
+  add_constant_residual(diff, dest, stride, 32, 32);
+}
 
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                             dst + 8 * stride + 8, stride, stride,
-                             xd->eobs[12]);
+void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
+                   int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add(input, dest, stride, eob);
+  } else {
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
+    vpx_memset(input, 0, 32);
+  }
 }
 
-void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
-                                        uint8_t *pre,
-                                        uint8_t *dst,
-                                        int stride, MACROBLOCKD *xd) {
-  uint8_t *origdest = dst;
-  uint8_t *origpred = pre;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, xd->eobs[0]);
-  vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride, xd->eobs[4]);
-  vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride,
-                             xd->eobs[8]);
-  vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride,
-                             xd->eobs[12]);
+void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                       int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add_8x8(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_short_iht8x8_add(input, dest, stride, tx_type);
+      vpx_memset(input, 0, 128);
+    }
+  }
 }
 
-void vp9_dequant_idct_add_uv_block_8x8_c(int16_t *q, const int16_t *dq,
-                                         uint8_t *pre,
-                                         uint8_t *dstu,
-                                         uint8_t *dstv,
-                                         int stride, MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, xd->eobs[16]);
-
-  q    += 64;
-  pre  += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, xd->eobs[20]);
+void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  if (eob > 1) {
+    vp9_short_idct4x4_add(input, dest, stride);
+    vpx_memset(input, 0, 32);
+  } else {
+    vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
+    ((int *)input)[0] = 0;
+  }
 }
 
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
-                                                 uint8_t *dstu,
-                                                 uint8_t *dstv,
-                                                 int stride,
-                                                 MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride,
-                             xd->eobs[16]);
-
-  q += 64;
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride,
-                             xd->eobs[20]);
+void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
+                             int eob) {
+  if (eob > 1) {
+    vp9_short_iwalsh4x4_add(input, dest, stride);
+    vpx_memset(input, 0, 32);
+  } else {
+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
+    ((int *)input)[0] = 0;
+  }
 }
 
+void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  // If dc is 1, input[0] is already the reconstructed value, so no
+  // dequantization is needed. Also, when dc is 1 it is counted in eobs, i.e. eobs >= 1.
 
-void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
-                                             uint8_t *pre,
-                                             uint8_t *dst,
-                                             int stride, MACROBLOCKD *xd) {
-  int i, j;
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  if (eob) {
+    if (eob == 1) {
+      // DC only DCT coefficient
+      int16_t in = input[0];
+      int16_t out;
 
-  for (i = 0; i < 4; i++) {
-    for (j = 0; j < 4; j++) {
-      vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride,
-                                      xd->eobs[i * 4 + j]);
-      q   += 16;
-      pre += 4;
-      dst += 4;
+      // Note: the idct1 will need to be modified accordingly whenever
+      // vp9_short_idct8x8_c() is modified.
+      vp9_short_idct1_8x8_c(&in, &out);
+      input[0] = 0;
+
+      vp9_add_constant_residual_8x8(out, dest, stride);
+    } else {
+      vp9_short_idct8x8_add(input, dest, stride);
+      vpx_memset(input, 0, 128);
     }
+  }
+}
 
-    pre += 64 - 16;
-    dst += 4 * stride - 16;
+void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
+                         int stride, int eob) {
+  if (tx_type == DCT_DCT) {
+    vp9_idct_add_16x16(input, dest, stride, eob);
+  } else {
+    if (eob > 0) {
+      vp9_short_iht16x16_add(input, dest, stride, tx_type);
+      vpx_memset(input, 0, 512);
+    }
   }
 }
 
-void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
-                                              uint8_t *pre,
-                                              uint8_t *dstu,
-                                              uint8_t *dstv,
-                                              int stride,
-                                              MACROBLOCKD *xd) {
-  int i, j;
+void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to separate different cases. */
+  if (eob) {
+    if (eob == 1) {
+      /* DC only DCT coefficient. */
+      int16_t in = input[0];
+      int16_t out;
+      /* Note: the idct1 will need to be modified accordingly whenever
+       * vp9_short_idct16x16() is modified. */
+      vp9_short_idct1_16x16_c(&in, &out);
+      input[0] = 0;
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride,
-                                      xd->eobs[16 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstu += 4;
+      vp9_add_constant_residual_16x16(out, dest, stride);
+    } else {
+      vp9_short_idct16x16_add(input, dest, stride);
+      vpx_memset(input, 0, 512);
     }
-
-    pre  += 32 - 8;
-    dstu += 4 * stride - 8;
   }
+}
 
-  for (i = 0; i < 2; i++) {
-    for (j = 0; j < 2; j++) {
-      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride,
-                                      xd->eobs[20 + i * 2 + j]);
-      q    += 16;
-      pre  += 4;
-      dstv += 4;
-    }
+void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);
 
-    pre  += 32 - 8;
-    dstv += 4 * stride - 8;
+  if (eob) {
+    if (eob == 1) {
+      vp9_short_idct1_32x32(input, output);
+      vp9_add_constant_residual_32x32(output[0], dest, stride);
+      input[0] = 0;
+    } else {
+      vp9_short_idct32x32_add(input, dest, stride);
+      vpx_memset(input, 0, 2048);
+    }
   }
 }
 
--- /dev/null
+++ b/vp9/decoder/vp9_idct_blk.h
@@ -1,0 +1,36 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_DECODER_VP9_IDCT_BLK_H_
+#define VP9_DECODER_VP9_IDCT_BLK_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+
+void vp9_idct_add_lossless_c(int16_t *input, unsigned char *dest, int stride,
+                             int eob);
+
+void vp9_idct_add_y_block_lossless_c(int16_t *q, unsigned char *dst, int stride,
+                                     struct macroblockd *xd);
+
+void vp9_idct_add_uv_block_lossless_c(int16_t *q, unsigned char *dst,
+                                      int stride, uint16_t *eobs);
+
+void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
+                   int stride, int eob);
+
+void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
+                       int stride, int eob);
+
+void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, unsigned char *dest,
+                         int stride, int eob);
+
+#endif  // VP9_DECODER_VP9_IDCT_BLK_H_
--- a/vp9/decoder/vp9_onyxd.h
+++ b/vp9/decoder/vp9_onyxd.h
@@ -11,54 +11,56 @@
 #ifndef VP9_COMMON_VP9_ONYXD_H_
 #define VP9_COMMON_VP9_ONYXD_H_
 
-/* Create/destroy static data structures. */
 #ifdef __cplusplus
 extern "C" {
 #endif
+
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_ppflags.h"
-#include "vpx_ports/mem.h"
 #include "vpx/vpx_codec.h"
 
-  typedef void   *VP9D_PTR;
-  typedef struct {
-    int     Width;
-    int     Height;
-    int     Version;
-    int     postprocess;
-    int     max_threads;
-    int     inv_tile_order;
-    int     input_partition;
-  } VP9D_CONFIG;
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
+typedef void *VP9D_PTR;
 
-  void vp9_initialize_dec(void);
+typedef struct {
+  int width;
+  int height;
+  int version;
+  int postprocess;
+  int max_threads;
+  int inv_tile_order;
+  int input_partition;
+} VP9D_CONFIG;
 
-  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char **dest,
-                                  int64_t time_stamp);
+typedef enum {
+  VP9_LAST_FLAG = 1,
+  VP9_GOLD_FLAG = 2,
+  VP9_ALT_FLAG = 4
+} VP9_REFFRAME;
 
-  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
-                        vp9_ppflags_t *flags);
+void vp9_initialize_dec();
 
-  vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,
-                                         VP9_REFFRAME ref_frame_flag,
-                                         YV12_BUFFER_CONFIG *sd);
+int vp9_receive_compressed_data(VP9D_PTR comp,
+                                uint64_t size, const uint8_t **dest,
+                                int64_t time_stamp);
 
-  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
+int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+                      int64_t *time_stamp, int64_t *time_end_stamp,
+                      vp9_ppflags_t *flags);
 
-  int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,
+                                       VP9_REFFRAME ref_frame_flag,
+                                       YV12_BUFFER_CONFIG *sd);
 
-  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+                                      VP9_REFFRAME ref_frame_flag,
+                                      YV12_BUFFER_CONFIG *sd);
 
-  void vp9_remove_decompressor(VP9D_PTR comp);
+int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+
+
+VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+void vp9_remove_decompressor(VP9D_PTR comp);
 
 #ifdef __cplusplus
 }
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -21,8 +21,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
-
 #include "vp9/common/vp9_quant_common.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -36,7 +34,7 @@
 static void recon_write_yuv_frame(const char *name,
                                   const YV12_BUFFER_CONFIG *s,
                                   int w, int _h) {
-  FILE *yuv_file = fopen((char *)name, "ab");
+  FILE *yuv_file = fopen(name, "ab");
   const uint8_t *src = s->y_buffer;
   int h = _h;
 
@@ -111,7 +109,7 @@
 }
 
 VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
-  VP9D_COMP *pbi = vpx_memalign(32, sizeof(VP9D_COMP));
+  VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
 
   if (!pbi)
     return NULL;
@@ -121,7 +119,7 @@
   if (setjmp(pbi->common.error.jmp)) {
     pbi->common.error.setjmp = 0;
     vp9_remove_decompressor(pbi);
-    return 0;
+    return NULL;
   }
 
   pbi->common.error.setjmp = 1;
@@ -128,33 +126,30 @@
   vp9_initialize_dec();
 
   vp9_create_common(&pbi->common);
-  pbi->oxcf = *oxcf;
 
+  pbi->oxcf = *oxcf;
   pbi->common.current_video_frame = 0;
   pbi->ready_for_new_data = 1;
 
-  /* vp9_init_de_quantizer() is first called here. Add check in
-   * frame_init_dequantizer() to avoid unnecessary calling of
-   * vp9_init_de_quantizer() for every frame.
-   */
-  vp9_init_de_quantizer(pbi);
+  // vp9_init_dequantizer() is first called here. Add check in
+  // frame_init_dequantizer() to avoid unnecessary calling of
+  // vp9_init_dequantizer() for every frame.
+  vp9_init_dequantizer(&pbi->common);
 
   vp9_loop_filter_init(&pbi->common);
 
   pbi->common.error.setjmp = 0;
-
   pbi->decoded_key_frame = 0;
 
-  return (VP9D_PTR) pbi;
+  return pbi;
 }
 
 void vp9_remove_decompressor(VP9D_PTR ptr) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+  VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
 
   if (!pbi)
     return;
 
-  // Delete segmentation map
   if (pbi->common.last_frame_seg_map)
     vpx_free(pbi->common.last_frame_seg_map);
 
@@ -252,7 +247,7 @@
   return 0;
 }
 
-/* If any buffer updating is signalled it should be done here. */
+/* If any buffer updating is signaled it should be done here. */
 static void swap_frame_buffers(VP9D_COMP *pbi) {
   int ref_index = 0, mask;
 
@@ -273,24 +268,23 @@
     pbi->common.active_ref_idx[ref_index] = INT_MAX;
 }
 
-int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
-                                const unsigned char **psource,
+int vp9_receive_compressed_data(VP9D_PTR ptr,
+                                uint64_t size, const uint8_t **psource,
                                 int64_t time_stamp) {
   VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
-  const unsigned char *source = *psource;
+  const uint8_t *source = *psource;
   int retcode = 0;
 
   /*if(pbi->ready_for_new_data == 0)
       return -1;*/
 
-  if (ptr == 0) {
+  if (ptr == 0)
     return -1;
-  }
 
   pbi->common.error.error_code = VPX_CODEC_OK;
 
-  pbi->Source = source;
+  pbi->source = source;
   pbi->source_sz = size;
 
   if (pbi->source_sz == 0) {
@@ -325,6 +319,7 @@
 
     if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
       cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
     return -1;
   }
 
@@ -354,10 +349,20 @@
 
     if (cm->filter_level) {
       /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0,
-                            cm->dering_enabled);
+      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);
     }
-    vp8_yv12_extend_frame_borders(cm->frame_to_show);
+
+#if WRITE_RECON_BUFFER == 2
+    if (cm->show_frame)
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame + 2000);
+    else
+      write_dx_frame_to_file(cm->frame_to_show,
+                             cm->current_video_frame + 3000);
+#endif
+
+    vp9_extend_frame_borders(cm->frame_to_show,
+                             cm->subsampling_x, cm->subsampling_y);
   }
 
 #if WRITE_RECON_BUFFER == 1
@@ -368,19 +373,19 @@
 
   vp9_clear_system_state();
 
+  cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
-    vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-  } else {
-    vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
-  }
+    // current mip will be the prev_mip for the next frame
+    MODE_INFO *temp = cm->prev_mip;
+    cm->prev_mip = cm->mip;
+    cm->mip = temp;
 
-  /*vp9_print_modes_and_motion_vectors(cm->mi, cm->mb_rows,cm->mb_cols,
-                                       cm->current_video_frame);*/
+    // update the upper left visible macroblock ptrs
+    cm->mi = cm->mip + cm->mode_info_stride + 1;
+    cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
 
-  if (cm->show_frame)
     cm->current_video_frame++;
+  }
 
   pbi->ready_for_new_data = 0;
   pbi->last_time_stamp = time_stamp;
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -14,7 +14,7 @@
 #include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/decoder/vp9_dequantize.h"
+#include "vp9/decoder/vp9_idct_blk.h"
 
 // #define DEC_DEBUG
 
@@ -25,13 +25,12 @@
 
   VP9D_CONFIG oxcf;
 
+  const uint8_t *source;
+  uint32_t source_sz;
 
-  const unsigned char *Source;
-  unsigned int   source_sz;
-
   vp9_reader *mbc;
   int64_t last_time_stamp;
-  int   ready_for_new_data;
+  int ready_for_new_data;
 
   int refresh_frame_flags;
   vp9_prob prob_skip_false;
@@ -41,8 +40,6 @@
   int initial_width;
   int initial_height;
 } VP9D_COMP;
-
-int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);
 
 
 #if CONFIG_DEBUG
--- /dev/null
+++ b/vp9/decoder/vp9_read_bit_buffer.h
@@ -1,0 +1,54 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_READ_BIT_BUFFER_
+#define VP9_READ_BIT_BUFFER_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+typedef void (*vp9_rb_error_handler)(void *data, int bit_offset);
+
+struct vp9_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  size_t bit_offset;
+
+  void *error_handler_data;
+  vp9_rb_error_handler error_handler;
+};
+
+static size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
+  return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0);
+}
+
+static int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
+  const int off = rb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (rb->bit_buffer + p >= rb->bit_buffer_end) {
+    rb->error_handler(rb->error_handler_data, rb->bit_offset);
+    return 0;
+  } else {
+    const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+    rb->bit_offset = off + 1;
+    return bit;
+  }
+}
+
+static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= vp9_rb_read_bit(rb) << bit;
+  return value;
+}
+
+#endif  // VP9_READ_BIT_BUFFER_
--- a/vp9/decoder/vp9_treereader.h
+++ b/vp9/decoder/vp9_treereader.h
@@ -15,12 +15,8 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-typedef BOOL_DECODER vp9_reader;
-
-#define vp9_read decode_bool
-#define vp9_read_literal decode_value
-#define vp9_read_bit(r) vp9_read(r, vp9_prob_half)
 #define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8))
+#define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value))
 
 // Intent of tree data structure is to make decoding trivial.
 static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,249 +15,20 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred,
-                               int pitch, uint8_t *dest, int stride) {
-  const int width = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(pred + 0 * pitch));
-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(pred + 1 * pitch));
-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(pred + 2 * pitch));
-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(pred + 3 * pitch));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  p0 = _mm_srli_si128(p0, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-  dest += stride;
-
-  p2 = _mm_srli_si128(p2, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
-void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred,
-                               int pitch, uint8_t *dest, int stride) {
-  const int width = 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-  const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
-  const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
-  const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
-  const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
-  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
-  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
-  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
-  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
-  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
-  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
-  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-  p4 = _mm_unpacklo_epi8(p4, zero);
-  p5 = _mm_unpacklo_epi8(p5, zero);
-  p6 = _mm_unpacklo_epi8(p6, zero);
-  p7 = _mm_unpacklo_epi8(p7, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-  p4 = _mm_add_epi16(p4, d4);
-  p5 = _mm_add_epi16(p5, d5);
-  p6 = _mm_add_epi16(p6, d6);
-  p7 = _mm_add_epi16(p7, d7);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-  p4 = _mm_packus_epi16(p4, p5);
-  p6 = _mm_packus_epi16(p6, p7);
-
-  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
-  p0 = _mm_srli_si128(p0, 8);
-  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
-  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
-  p2 = _mm_srli_si128(p2, 8);
-  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
-  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
-  p4 = _mm_srli_si128(p4, 8);
-  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
-  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
-  p6 = _mm_srli_si128(p6, 8);
-  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
-void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred,
-                             int pitch, uint8_t *dest, int stride) {
-  const int width = 16;
-  int i = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-    p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-    p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
-    p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
-    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
-    diff += 4 * width;
-    pred += 4 * pitch;
-    dest += 4 * stride;
-  } while (--i);
-}
-
-void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred,
-                             int pitch, uint8_t *dest, int stride) {
-  const int width = 32;
-  int i = 16;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-    p3 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
-    p5 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-    p7 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
-    diff += 2 * width;
-    pred += 2 * pitch;
-    dest += 2 * stride;
-  } while (--i);
-}
-
-void vp9_add_constant_residual_8x8_sse2(const int16_t diff, const uint8_t *pred,
-                                        int pitch, uint8_t *dest, int stride) {
+void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
+                                        int stride) {
   uint8_t abs_diff;
   __m128i d;
 
   // Prediction data.
-  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
-  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
-  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
-  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
-  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
-  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
-  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
-  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
 
   p0 = _mm_unpacklo_epi64(p0, p1);
   p2 = _mm_unpacklo_epi64(p2, p3);
@@ -301,29 +72,28 @@
   _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
 }
 
-void vp9_add_constant_residual_16x16_sse2(const int16_t diff,
-                                          const uint8_t *pred, int pitch,
-                                          uint8_t *dest, int stride) {
+void vp9_add_constant_residual_16x16_sse2(const int16_t diff, uint8_t *dest,
+                                          int stride) {
   uint8_t abs_diff;
   __m128i d;
 
   // Prediction data.
-  __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-  __m128i p1 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-  __m128i p2 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
-  __m128i p3 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
-  __m128i p4 = _mm_load_si128((const __m128i *)(pred + 4 * pitch));
-  __m128i p5 = _mm_load_si128((const __m128i *)(pred + 5 * pitch));
-  __m128i p6 = _mm_load_si128((const __m128i *)(pred + 6 * pitch));
-  __m128i p7 = _mm_load_si128((const __m128i *)(pred + 7 * pitch));
-  __m128i p8 = _mm_load_si128((const __m128i *)(pred + 8 * pitch));
-  __m128i p9 = _mm_load_si128((const __m128i *)(pred + 9 * pitch));
-  __m128i p10 = _mm_load_si128((const __m128i *)(pred + 10 * pitch));
-  __m128i p11 = _mm_load_si128((const __m128i *)(pred + 11 * pitch));
-  __m128i p12 = _mm_load_si128((const __m128i *)(pred + 12 * pitch));
-  __m128i p13 = _mm_load_si128((const __m128i *)(pred + 13 * pitch));
-  __m128i p14 = _mm_load_si128((const __m128i *)(pred + 14 * pitch));
-  __m128i p15 = _mm_load_si128((const __m128i *)(pred + 15 * pitch));
+  __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
+  __m128i p1 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
+  __m128i p2 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
+  __m128i p3 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
+  __m128i p4 = _mm_load_si128((const __m128i *)(dest + 4 * stride));
+  __m128i p5 = _mm_load_si128((const __m128i *)(dest + 5 * stride));
+  __m128i p6 = _mm_load_si128((const __m128i *)(dest + 6 * stride));
+  __m128i p7 = _mm_load_si128((const __m128i *)(dest + 7 * stride));
+  __m128i p8 = _mm_load_si128((const __m128i *)(dest + 8 * stride));
+  __m128i p9 = _mm_load_si128((const __m128i *)(dest + 9 * stride));
+  __m128i p10 = _mm_load_si128((const __m128i *)(dest + 10 * stride));
+  __m128i p11 = _mm_load_si128((const __m128i *)(dest + 11 * stride));
+  __m128i p12 = _mm_load_si128((const __m128i *)(dest + 12 * stride));
+  __m128i p13 = _mm_load_si128((const __m128i *)(dest + 13 * stride));
+  __m128i p14 = _mm_load_si128((const __m128i *)(dest + 14 * stride));
+  __m128i p15 = _mm_load_si128((const __m128i *)(dest + 15 * stride));
 
   // Clip diff value to [0, 255] range. Then, do addition or subtraction
   // according to its sign.
@@ -388,9 +158,8 @@
   _mm_store_si128((__m128i *)(dest + 15 * stride), p15);
 }
 
-void vp9_add_constant_residual_32x32_sse2(const int16_t diff,
-                                          const uint8_t *pred, int pitch,
-                                          uint8_t *dest, int stride) {
+void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
+                                          int stride) {
   uint8_t abs_diff;
   __m128i d;
   int i = 8;
@@ -405,14 +174,14 @@
 
   do {
     // Prediction data.
-    __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
-    __m128i p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
-    __m128i p2 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
-    __m128i p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
-    __m128i p4 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
-    __m128i p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch + 16));
-    __m128i p6 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
-    __m128i p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch + 16));
+    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
+    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
+    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
+    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
+    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
+    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
+    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
+    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));
 
     // Clip diff value to [0, 255] range. Then, do addition or subtraction
     // according to its sign.
@@ -446,7 +215,6 @@
     _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
     _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);
 
-    pred += 4 * pitch;
     dest += 4 * stride;
   } while (--i);
 }
--- a/vp9/decoder/x86/vp9_idct_blk_sse2.c
+++ /dev/null
@@ -1,117 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/decoder/vp9_dequantize.h"
-
-void vp9_idct_dequant_dc_0_2x_sse2(short *q, const short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, const short *dc);
-
-void vp9_idct_dequant_dc_full_2x_sse2(short *q, const short *dq,
-                                      unsigned char *pre, unsigned char *dst,
-                                      int dst_stride, const short *dc);
-
-void vp9_idct_dequant_0_2x_sse2(short *q, const short *dq,
-                                unsigned char *pre, unsigned char *dst,
-                                int dst_stride, int blk_stride);
-
-void vp9_idct_dequant_full_2x_sse2(short *q, const short *dq,
-                                   unsigned char *pre, unsigned char *dst,
-                                   int dst_stride, int blk_stride);
-
-void vp9_dequant_dc_idct_add_y_block_sse2(short *q, const short *dq,
-                                          unsigned char *pre,
-                                          unsigned char *dst,
-                                          int stride, unsigned short *eobs,
-                                          const short *dc) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q, dq, pre, dst, stride, dc);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q, dq, pre, dst, stride, dc);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_dc_full_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                       stride, dc + 2);
-    else
-      vp9_idct_dequant_dc_0_2x_sse2(q + 32, dq, pre + 8, dst + 8,
-                                    stride, dc + 2);
-
-    q    += 64;
-    dc   += 4;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_y_block_sse2(short *q, const short *dq,
-                                       unsigned char *pre, unsigned char *dst,
-                                       int stride, unsigned short *eobs) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    if (((short *)(eobs))[0] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q, dq, pre, dst, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q, dq, pre, dst, stride, 16);
-
-    if (((short *)(eobs))[1] & 0xfefe)
-      vp9_idct_dequant_full_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-    else
-      vp9_idct_dequant_0_2x_sse2(q + 32, dq, pre + 8, dst + 8, stride, 16);
-
-    q    += 64;
-    pre  += 64;
-    dst  += stride * 4;
-    eobs += 4;
-  }
-}
-
-void vp9_dequant_idct_add_uv_block_sse2(short *q, const short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dstu,
-                                        unsigned char *dstv,
-                                        int stride, unsigned short *eobs) {
-  if (((short *)(eobs))[0] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstu += stride * 4;
-
-  if (((short *)(eobs))[1] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstu, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstu, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-
-  if (((short *)(eobs))[2] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-
-  q    += 32;
-  pre  += 32;
-  dstv += stride * 4;
-
-  if (((short *)(eobs))[3] & 0xfefe)
-    vp9_idct_dequant_full_2x_sse2(q, dq, pre, dstv, stride, 8);
-  else
-    vp9_idct_dequant_0_2x_sse2(q, dq, pre, dstv, stride, 8);
-}
--- a/vp9/encoder/ppc/vp9_csystemdependent.c
+++ /dev/null
@@ -1,155 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-
-SADFunction *vp9_sad16x16;
-SADFunction *vp9_sad16x8;
-SADFunction *vp9_sad8x16;
-SADFunction *vp9_sad8x8;
-SADFunction *vp9_sad4x4;
-
-variance_function *vp9_variance4x4;
-variance_function *vp9_variance8x8;
-variance_function *vp9_variance8x16;
-variance_function *vp9_variance16x8;
-variance_function *vp9_variance16x16;
-
-variance_function *vp9_mse16x16;
-
-sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
-
-int (*vp9_block_error)(short *coeff, short *dqcoeff);
-int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp9_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp9_get_mb_ss)(short *);
-void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp9_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp9_get_mb_ss_c(short *);
-
-// ppc
-extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp9_sad16x16_ppc;
-extern SADFunction vp9_sad16x8_ppc;
-extern SADFunction vp9_sad8x16_ppc;
-extern SADFunction vp9_sad8x8_ppc;
-extern SADFunction vp9_sad4x4_ppc;
-
-extern variance_function vp9_variance16x16_ppc;
-extern variance_function vp9_variance8x16_ppc;
-extern variance_function vp9_variance16x8_ppc;
-extern variance_function vp9_variance8x8_ppc;
-extern variance_function vp9_variance4x4_ppc;
-extern variance_function vp9_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-
-void vp9_cmachine_specific_config(void) {
-  // Pure C:
-  vp9_mbuverror               = vp9_mbuverror_c;
-  vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
-  vp9_short_fdct4x4            = vp9_short_fdct4x4_ppc;
-  vp9_short_fdct8x4            = vp9_short_fdct8x4_ppc;
-  vp8_fast_fdct4x4             = vp9_short_fdct4x4_ppc;
-  vp8_fast_fdct8x4             = vp9_short_fdct8x4_ppc;
-  short_walsh4x4               = vp9_short_walsh4x4_c;
-
-  vp9_variance4x4             = vp9_variance4x4_ppc;
-  vp9_variance8x8             = vp9_variance8x8_ppc;
-  vp9_variance8x16            = vp9_variance8x16_ppc;
-  vp9_variance16x8            = vp9_variance16x8_ppc;
-  vp9_variance16x16           = vp9_variance16x16_ppc;
-  vp9_mse16x16                = vp9_mse16x16_ppc;
-
-  vp9_sub_pixel_variance4x4     = vp9_sub_pixel_variance4x4_ppc;
-  vp9_sub_pixel_variance8x8     = vp9_sub_pixel_variance8x8_ppc;
-  vp9_sub_pixel_variance8x16    = vp9_sub_pixel_variance8x16_ppc;
-  vp9_sub_pixel_variance16x8    = vp9_sub_pixel_variance16x8_ppc;
-  vp9_sub_pixel_variance16x16   = vp9_sub_pixel_variance16x16_ppc;
-
-  vp9_get_mb_ss                 = vp9_get_mb_ss_c;
-
-  vp9_sad16x16                = vp9_sad16x16_ppc;
-  vp9_sad16x8                 = vp9_sad16x8_ppc;
-  vp9_sad8x16                 = vp9_sad8x16_ppc;
-  vp9_sad8x8                  = vp9_sad8x8_ppc;
-  vp9_sad4x4                  = vp9_sad4x4_ppc;
-
-  vp9_block_error              = vp9_block_error_ppc;
-  vp9_mbblock_error            = vp9_mbblock_error_c;
-
-  vp9_subtract_b               = vp9_subtract_b_c;
-  vp9_subtract_mby             = vp9_subtract_mby_ppc;
-  vp9_subtract_mbuv            = vp9_subtract_mbuv_ppc;
-}
--- a/vp9/encoder/ppc/vp9_encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_subtract_mbuv_ppc
-    .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r9, 256
-    add     r3, r3, r9
-    add     r3, r3, r9
-    add     r6, r6, r9
-
-    li      r10, 16
-    li      r9,  4
-    mtctr   r9
-
-    vspltisw v0, 0
-
-mbu_loop:
-    lvsl    v5, 0, r4           ;# permutate value for alignment
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r4, r4, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r4           ;# permutate value for alignment
-    lvx     v1, 0, r4           ;# src
-
-    add     r4, r4, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbu_loop
-
-    mtctr   r9
-
-mbv_loop:
-    lvsl    v5, 0, r5           ;# permutate value for alignment
-    lvx     v1, 0, r5           ;# src
-    lvx     v2, 0, r6           ;# pred
-
-    add     r5, r5, r7
-    addi    r6, r6, 16
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    lvsl    v5, 0, r5           ;# permutate value for alignment
-    lvx     v1, 0, r5           ;# src
-
-    add     r5, r5, r7
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vperm   v1, v1, v0, v5
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrglb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mbv_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-    vspltisw v0, 0
-
-mby_loop:
-    lvx     v1, 0, r4           ;# src
-    lvx     v2, 0, r5           ;# pred
-
-    add     r4, r4, r6
-    addi    r5, r5, 16
-
-    vmrghb  v3, v0, v1          ;# unpack high src  to short
-    vmrghb  v4, v0, v2          ;# unpack high pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, 0, r3           ;# store out diff
-
-    vmrglb  v3, v0, v1          ;# unpack low src  to short
-    vmrglb  v4, v0, v2          ;# unpack low pred to short
-
-    vsubshs v3, v3, v4
-
-    stvx    v3, r10, r3         ;# store out diff
-
-    addi    r3, r3, 32
-
-    bdnz    mby_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp9/encoder/ppc/vp9_fdct_altivec.asm
+++ /dev/null
@@ -1,205 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_short_fdct4x4_ppc
-    .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;#   in normalization (fwd is twice unitary, inv is half unitary)
-;#   and that they are of course transposes of each other.
-;#
-;#   The following three accomplish most of implementation and
-;#   are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    li      r6, 16
-
-    load_c v0, dct_tab, 0, r9, r10
-    lvx     v1,   r6, r10
-    addi    r10, r10, 32
-    lvx     v2,    0, r10
-    lvx     v3,   r6, r10
-
-    load_c v4, ppc_dctperm_tab,  0, r9, r10
-    load_c v5, ppc_dctperm_tab, r6, r9, r10
-
-    load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
-;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
-;#   For fwd transform, indices are horizontal positions, then frequencies.
-;#   For inverse transform, frequencies then positions.
-;#   The two resulting  A0..A3  B0..B3  are later combined
-;#   and vertically transformed.
-
-.macro two_rows_horiz Dst
-    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
-
-    vmsumshm v10, v0, v8, v6
-    vmsumshm v10, v1, v9, v10
-    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
-
-    vmsumshm v11, v2, v8, v6
-    vmsumshm v11, v3, v9, v11
-    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
-
-    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
-    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;#   forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
-    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
-    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v10, v8, v7
-
-    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
-    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
-    vmsumshm v8, v8, v12, v6
-    vmsumshm v8, v9, v13, v8
-    vsraw   v8, v8, v7
-
-    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
-.endm
-
-.macro two_rows_h Dest
-    stw     r0,  0(r8)
-    lwz     r0,  4(r3)
-    stw     r0,  4(r8)
-    lwzux   r0, r3,r5
-    stw     r0,  8(r8)
-    lwz     r0,  4(r3)
-    stw     r0, 12(r8)
-    lvx     v8,  0,r8
-    two_rows_horiz \Dest
-.endm
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
-    prologue
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8,  r1, 0
-    addi    r10, r3, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    ;# Next block
-    addi    r3, r10, 8
-    addi    r4, r4, 32
-    lvx     v6, 0, r9           ;# v6 = Hround
-
-    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
-    addi    r8, r1, 0
-
-    lwz     r0, 0(r3)
-    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
-
-    lwzux   r0, r3, r5
-    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
-
-    lvx     v6, r6, r9          ;# v6 = Vround
-    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
-
-    two_rows_vert v0, v1
-    stvx    v8, 0, r4
-    two_rows_vert v2, v3
-    stvx    v8, r6, r4
-
-    epilogue
-
-    blr
-
-    .data
-    .align 4
-ppc_dctperm_tab:
-    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
-    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
-    .align 4
-dct_tab:
-    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
-    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
-    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
-    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
-    .align 4
-round_tab:
-    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
-    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
--- a/vp9/encoder/ppc/vp9_rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_block_error_ppc
-
-    .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf800
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    stw     r5, 12(r1)          ;# tranfer dc to vector register
-
-    lvx     v0, 0, r3           ;# Coeff
-    lvx     v1, 0, r4           ;# dqcoeff
-
-    li      r10, 16
-
-    vspltisw v3, 0
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v2, v0, v0, v3     ;# multiply differences
-
-    lvx     v0, r10, r3         ;# Coeff
-    lvx     v1, r10, r4         ;# dqcoeff
-
-    vsubshs v0, v0, v1
-
-    vmsumshm v1, v0, v0, v2     ;# multiply differences
-    vsumsws v1, v1, v3          ;# sum up
-
-    stvx    v1, 0, r1
-    lwz     r3, 12(r1)          ;# return value
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
--- a/vp9/encoder/ppc/vp9_sad_altivec.asm
+++ /dev/null
@@ -1,277 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_sad16x16_ppc
-    .globl vp8_sad16x8_ppc
-    .globl vp8_sad8x16_ppc
-    .globl vp8_sad8x8_ppc
-    .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutate value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v8, 0              ;# zero out total to start
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v8, v6, v8
-.endm
-
-.macro sad_16_loop loop_label
-    lvsl    v3,  0, r5          ;# only needs to be done once per block
-
-    ;# preload a line of data before getting into the loop
-    lvx     v4, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    add     r5, r5, r6
-    add     r3, r3, r4
-
-    vperm   v5, v1, v2, v3
-
-    .align 4
-\loop_label:
-    ;# compute difference on first row
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-
-    ;# load up next set of data
-    lvx     v9, 0, r3
-    lvx     v1,  0, r5
-    lvx     v2, r10, r5
-
-    ;# perform abs() of difference
-    vor     v6, v6, v7
-    add     r3, r3, r4
-
-    ;# add to the running tally
-    vsum4ubs v8, v6, v8
-
-    ;# now onto the next line
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-    lvx     v4, 0, r3
-
-    ;# compute difference on second row
-    vsububs v6, v9, v5
-    lvx     v1,  0, r5
-    vsububs v7, v5, v9
-    lvx     v2, r10, r5
-    vor     v6, v6, v7
-    add     r3, r3, r4
-    vsum4ubs v8, v6, v8
-    vperm   v5, v1, v2, v3
-    add     r5, r5, r6
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
-    .align 4
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v7, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v7
-
-    SAD_16
-
-    bdnz    \loop_label
-
-    vspltisw v7, 0
-
-    vsumsws v8, v8, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_16_loop sad16x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_16_loop sad16x8_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    sad_8_loop sad8x16_loop
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    sad_8_loop sad8x8_loop
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r7, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r7,  4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    vspltisw v8, 0              ;# zero out total to start
-
-    ;# v6 = abs (v4 - v5)
-    vsububs v6, v4, v5
-    vsububs v7, v5, v4
-    vor     v6, v6, v7
-
-    ;# v8 += abs (v4 - v5)
-    vsum4ubs v7, v6, v8
-    vsumsws v7, v7, v8
-
-    stvx    v7, 0, r1
-    lwz     r3, 12(r1)
-
-    epilogue
-
-    blr
--- a/vp9/encoder/ppc/vp9_variance_altivec.asm
+++ /dev/null
@@ -1,375 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp8_get8x8var_ppc
-    .globl vp8_get16x16var_ppc
-    .globl vp8_mse16x16_ppc
-    .globl vp9_variance16x16_ppc
-    .globl vp9_variance16x8_ppc
-    .globl vp9_variance8x16_ppc
-    .globl vp9_variance8x8_ppc
-    .globl vp9_variance4x4_ppc
-
-.macro load_aligned_16 V R O
-    lvsl    v3,  0, \R          ;# permutate value for alignment
-
-    lvx     v1,  0, \R
-    lvx     v2, \O, \R
-
-    vperm   \V, v1, v2, v3
-.endm
-
-.macro prologue
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffc0
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    li      r10, 16             ;# load offset and loop counter
-
-    vspltisw v7, 0              ;# zero for merging
-    vspltisw v8, 0              ;# zero out total to start
-    vspltisw v9, 0              ;# zero out total for dif^2
-.endm
-
-.macro epilogue
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first.  Unpack to so signed subract
-    ;#  can be used.  Only have a half word signed
-    ;#  subract.  Do high, then low.
-    vmrghb  v2, v7, v4
-    vmrghb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    vmrglb  v2, v7, v4
-    vmrglb  v3, v7, v5
-    vsubshs v2, v2, v3
-    vsum4shs v8, v2, v8
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v6, r3, r10
-    load_aligned_16 v0, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    vmrghb  v4, v4, v6
-    vmrghb  v5, v5, v0
-
-    compute_sum_sse
-
-    bdnz    \loop_label
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-.if \store_sum
-    stw     r3, 0(r8)           ;# sum
-.endif
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> 8
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, get8x8var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, get16x16var_loop, 1
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r 3 return value
-vp8_mse16x16_ppc:
-    prologue
-
-    mtctr   r10
-
-mse16x16_loop:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add     r3, r3, r4
-    add     r5, r5, r6
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor     v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-
-    bdnz    mse16x16_loop
-
-    vsumsws v9, v9, v7
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r3, 12(r1)
-
-    stw     r3, 0(r7)           ;# sse
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x16_ppc:
-
-    prologue
-
-    mtctr   r10
-
-    variance_16 8, variance16x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x8_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_16 7, variance16x8_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x16_ppc:
-
-    prologue
-
-    li      r9, 8
-    mtctr   r9
-
-    variance_8 7, variance8x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x8_ppc:
-
-    prologue
-
-    li      r9, 4
-    mtctr   r9
-
-    variance_8 6, variance8x8_loop, 0
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz     r0, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r10,0(\I)
-    add     \I, \I, \P
-
-    lwz     r8, 0(\I)
-    add     \I, \I, \P
-
-    lwz     r9, 0(\I)
-
-    stw     r0,  0(r1)
-    stw     r10, 4(r1)
-    stw     r8,  8(r1)
-    stw     r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int  source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int  recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx     v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx     v5, 0, r1
-
-    compute_sum_sse
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx    v8, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    v9, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r7)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, 4           ;# (sum*sum) >> 4
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
-
-    epilogue
-
-    blr
--- a/vp9/encoder/ppc/vp9_variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp9_sub_pixel_variance4x4_ppc
-    .globl vp9_sub_pixel_variance8x8_ppc
-    .globl vp9_sub_pixel_variance8x16_ppc
-    .globl vp9_sub_pixel_variance16x8_ppc
-    .globl vp9_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis     \R0, \LABEL@ha
-    la      \R1, \LABEL@l(\R0)
-    lvx     \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r12, r10
-
-    addi    r6,  r6, 16
-    lvx     \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi.   r5, r5, 4           ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li      r10, 16
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;#  just skip to the second pass.
-    beq     \jump_label
-
-    load_c v20, hfilter_b, r5, r12, r0
-
-    ;# setup constants
-    ;# v14 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r12, r0
-
-    ;# index to the next set of vectors in the row.
-    li      r12, 32
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
-
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-
-    vperm   v24, v21, v21, \hp  ;# v20 = 0123 1234 2345 3456
-    vperm   v25, v21, v21, \lp  ;# v21 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-
-    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25       ;# Ro = odds
-
-    vsrh    v22, v22, v19       ;# divide by 128
-    vsrh    v23, v23, v19       ;# v16 v17 = evens, odds
-    vmrghh  \P0, v22, v23       ;# v18 v19 = 16-bit result in order
-    vmrglh  v23, v22, v23
-    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first.  Unpack to so signed subract
-    ;#  can be used.  Only have a half word signed
-    ;#  subract.  Do high, then low.
-    vmrghb  \t1, \z0, \src
-    vmrghb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    vmrglb  \t1, \z0, \src
-    vmrglb  \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    ;# Now compute sse.
-    vsububs \t1, \src, \ref
-    vsububs \t2, \ref, \src
-    vor     \t1, \t1, \t2
-
-    vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
-    vsumsws \sum, \sum, \z0
-    vsumsws \sse, \sse, \z0
-
-    stvx    \sum, 0, r1
-    lwz     r3, 12(r1)
-
-    stvx    \sse, 0, r1
-    lwz     r4, 12(r1)
-
-    stw     r4, 0(r9)           ;# sse
-
-    mullw   r3, r3, r3          ;# sum*sum
-    srawi   r3, r3, \DS         ;# (sum*sum) >> 8
-    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 8)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
-    load_and_align_16  v16, r7, r8, \increment_counter
-    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
-    lvsl    v17,  0, \R         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, \R
-    lvx     v22, r10, \R
-
-.if \increment_counter
-    add     \R, \R, \P
-.endif
-
-    vperm   \V, v21, v22, v17
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance4x4_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xf830
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_4x4_b
-
-    hfilter_8 v4, v10, v11, 0
-
-    b   second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi    r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-
-compute_sum_sse_4x4_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_c v10, b_hilo_b, 0, r12, r0
-
-    vperm   v0, v0, v1, v10
-    vperm   v1, v2, v3, v10
-
-    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 4
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xfff0
-    ori     r12, r12, 0xffff
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-    hfilter_8 v4, v10, v11, 1
-    hfilter_8 v5, v10, v11, 1
-    hfilter_8 v6, v10, v11, 1
-    hfilter_8 v7, v10, v11, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x8_b
-
-    hfilter_8 v8, v10, v11, 0
-
-    b   second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 0
-
-    beq     compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0, v1
-    vmrghb  v1, v2, v3
-    vmrghb  v2, v4, v5
-    vmrghb  v3, v6, v7
-
-    load_and_align_16 v4,  r7, r8, 1
-    load_and_align_16 v5,  r7, r8, 1
-    load_and_align_16 v6,  r7, r8, 1
-    load_and_align_16 v7,  r7, r8, 1
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 0
-
-    vmrghb  v4, v4,  v5
-    vmrghb  v5, v6,  v7
-    vmrghb  v6, v8,  v9
-    vmrghb  v7, v10, v11
-
-    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 6
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance8x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfffc
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1,-32(r1)          ;# create space on the stack
-
-    HProlog second_pass_8x16_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v29, b_0123_b, 0, r12, r0
-    load_c v30, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0,  v29, v30, 1
-    hfilter_8 v1,  v29, v30, 1
-    hfilter_8 v2,  v29, v30, 1
-    hfilter_8 v3,  v29, v30, 1
-    hfilter_8 v4,  v29, v30, 1
-    hfilter_8 v5,  v29, v30, 1
-    hfilter_8 v6,  v29, v30, 1
-    hfilter_8 v7,  v29, v30, 1
-    hfilter_8 v8,  v29, v30, 1
-    hfilter_8 v9,  v29, v30, 1
-    hfilter_8 v10, v29, v30, 1
-    hfilter_8 v11, v29, v30, 1
-    hfilter_8 v12, v29, v30, 1
-    hfilter_8 v13, v29, v30, 1
-    hfilter_8 v14, v29, v30, 1
-    hfilter_8 v15, v29, v30, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_8x16_b
-
-    hfilter_8 v16, v29, v30, 0
-
-    b   second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16 v0,  r3, r4, 1
-    load_and_align_16 v1,  r3, r4, 1
-    load_and_align_16 v2,  r3, r4, 1
-    load_and_align_16 v3,  r3, r4, 1
-    load_and_align_16 v4,  r3, r4, 1
-    load_and_align_16 v5,  r3, r4, 1
-    load_and_align_16 v6,  r3, r4, 1
-    load_and_align_16 v7,  r3, r4, 1
-    load_and_align_16 v8,  r3, r4, 1
-    load_and_align_16 v9,  r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq     compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    vmrghb  v0, v0,  v1
-    vmrghb  v1, v2,  v3
-    vmrghb  v2, v4,  v5
-    vmrghb  v3, v6,  v7
-    vmrghb  v4, v8,  v9
-    vmrghb  v5, v10, v11
-    vmrghb  v6, v12, v13
-    vmrghb  v7, v14, v15
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 1
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
-    load_and_align_16 v8,  r7, r8, 1
-    load_and_align_16 v9,  r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 0
-
-    vmrghb  v8,  v8,  v9
-    vmrghb  v9,  v10, v11
-    vmrghb  v10, v12, v13
-    vmrghb  v11, v14, v15
-
-    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
-    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
-    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-    mtspr   256, r11            ;# reset old VRSAVE
-    blr
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm intput
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl    v17,  0, r3         ;# permutate value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;#  input will can span three vectors if not aligned correctly.
-    lvx     v21,   0, r3
-    lvx     v22, r10, r3
-    lvx     v23, r12, r3
-
-.if \increment_counter
-    add     r3, r3, r4
-.endif
-    vperm   v21, v21, v22, v17
-    vperm   v22, v22, v23, v17  ;# v8 v9 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi  v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi  v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi  v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
-    vsrh    v25, v25, v19
-
-    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
-    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
-.endm
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x8_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x8_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x8_b
-
-    hfilter_16 v8, 0
-
-    b   second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-
-    beq     compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-
-compute_sum_sse_16x8_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 0
-
-    variance_final v18, v19, v23, 7
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .align 2
-;# r3 unsigned char  *src_ptr
-;# r4 int  src_pixels_per_line
-;# r5 int  xoffset
-;# r6 int  yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xffff
-    ori     r12, r12, 0xfff8
-    mtspr   256, r12            ;# set VRSAVE
-
-    stwu    r1, -32(r1)         ;# create space on the stack
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0,  1
-    hfilter_16 v1,  1
-    hfilter_16 v2,  1
-    hfilter_16 v3,  1
-    hfilter_16 v4,  1
-    hfilter_16 v5,  1
-    hfilter_16 v6,  1
-    hfilter_16 v7,  1
-    hfilter_16 v8,  1
-    hfilter_16 v9,  1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block.  If there is no
-    ;#  vertical filtering, jump to storing the data.  Otherwise
-    ;#  load up and filter the additional line that is needed
-    ;#  for the vertical filter.
-    beq     compute_sum_sse_16x16_b
-
-    hfilter_16 v16, 0
-
-    b   second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi.   r6, r6, 5           ;# index into vertical filter array
-
-    load_and_align_16  v0,  r3, r4, 1
-    load_and_align_16  v1,  r3, r4, 1
-    load_and_align_16  v2,  r3, r4, 1
-    load_and_align_16  v3,  r3, r4, 1
-    load_and_align_16  v4,  r3, r4, 1
-    load_and_align_16  v5,  r3, r4, 1
-    load_and_align_16  v6,  r3, r4, 1
-    load_and_align_16  v7,  r3, r4, 1
-    load_and_align_16  v8,  r3, r4, 1
-    load_and_align_16  v9,  r3, r4, 1
-    load_and_align_16  v10, r3, r4, 1
-    load_and_align_16  v11, r3, r4, 1
-    load_and_align_16  v12, r3, r4, 1
-    load_and_align_16  v13, r3, r4, 1
-    load_and_align_16  v14, r3, r4, 1
-    load_and_align_16  v15, r3, r4, 1
-    load_and_align_16  v16, r3, r4, 0
-
-    beq     compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0,  v1
-    vfilter_16 v1,  v2
-    vfilter_16 v2,  v3
-    vfilter_16 v3,  v4
-    vfilter_16 v4,  v5
-    vfilter_16 v5,  v6
-    vfilter_16 v6,  v7
-    vfilter_16 v7,  v8
-    vfilter_16 v8,  v9
-    vfilter_16 v9,  v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
-    vspltish v18, 0             ;# sum
-    vspltish v19, 0             ;# sse
-    vspltish v23, 0             ;# unpack
-    li      r10, 16
-
-    compute_sum_sse_16 v0,  1
-    compute_sum_sse_16 v1,  1
-    compute_sum_sse_16 v2,  1
-    compute_sum_sse_16 v3,  1
-    compute_sum_sse_16 v4,  1
-    compute_sum_sse_16 v5,  1
-    compute_sum_sse_16 v6,  1
-    compute_sum_sse_16 v7,  1
-    compute_sum_sse_16 v8,  1
-    compute_sum_sse_16 v9,  1
-    compute_sum_sse_16 v10, 1
-    compute_sum_sse_16 v11, 1
-    compute_sum_sse_16 v12, 1
-    compute_sum_sse_16 v13, 1
-    compute_sum_sse_16 v14, 1
-    compute_sum_sse_16 v15, 0
-
-    variance_final v18, v19, v23, 8
-
-    addi    r1, r1, 32          ;# recover stack
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
--- a/vp9/encoder/vp9_asm_enc_offsets.c
+++ b/vp9/encoder/vp9_asm_enc_offsets.c
@@ -10,31 +10,8 @@
 
 
 #include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "vp9/encoder/vp9_block.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_tokenize.h"
 
 BEGIN
 
-/* regular quantize */
-DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));
-DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));
-DEFINE(vp9_block_round,                         offsetof(BLOCK, round));
-DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));
-DEFINE(vp9_block_quant_fast,                    offsetof(BLOCK, quant_fast));
-DEFINE(vp9_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
-DEFINE(vp9_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp9_block_quant_shift,                   offsetof(BLOCK, quant_shift));
 
-DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
-DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));
-DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
-
 END
-
-/* add asserts for any offset that is not supported by assembly code
- * add asserts for any size that is not supported by assembly code
- */
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -8,354 +8,300 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include "vp9/common/vp9_header.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_entropymv.h"
-#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/common/vp9_systemdependent.h"
 #include <assert.h>
 #include <stdio.h>
 #include <limits.h>
-#include "vp9/common/vp9_pragmas.h"
+
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_segmentation.h"
 
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_tile_common.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_write_bit_buffer.h"
+
+
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
 #endif
 
 #ifdef ENTROPY_STATS
-int intra_mode_stats[VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES]
-                    [VP9_KF_BINTRAMODES];
-vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
-vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
+int intra_mode_stats[VP9_INTRA_MODES]
+                    [VP9_INTRA_MODES]
+                    [VP9_INTRA_MODES];
+vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
 extern unsigned int active_section;
 #endif
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-unsigned int nzc_stats_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC4X4_TOKENS];
-unsigned int nzc_stats_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC8X8_TOKENS];
-unsigned int nzc_stats_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC16X16_TOKENS];
-unsigned int nzc_stats_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                          [NZC32X32_TOKENS];
-unsigned int nzc_pcat_stats[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
-                          [NZC_BITS_EXTRA][2];
-void init_nzcstats();
-void update_nzcstats(VP9_COMMON *const cm);
-void print_nzcstats();
-#endif
-#endif
-
-#ifdef MODE_STATS
-int count_mb_seg[4] = { 0, 0, 0, 0 };
-#endif
-
 #define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
-#define SEARCH_NEWP
-static int update_bits[255];
+#ifdef MODE_STATS
+int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
+int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+int64_t switchable_interp_stats[VP9_SWITCHABLE_FILTERS+1]
+                               [VP9_SWITCHABLE_FILTERS];
 
-static void compute_update_table() {
-  int i;
-  for (i = 0; i < 255; i++)
-    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
+void init_tx_count_stats() {
+  vp9_zero(tx_count_32x32p_stats);
+  vp9_zero(tx_count_16x16p_stats);
+  vp9_zero(tx_count_8x8p_stats);
 }
 
-static int split_index(int i, int n, int modulus) {
-  int max1 = (n - 1 - modulus / 2) / modulus + 1;
-  if (i % modulus == modulus / 2) i = i / modulus;
-  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
-  return i;
+void init_switchable_interp_stats() {
+  vp9_zero(switchable_interp_stats);
 }
 
-static int remap_prob(int v, int m) {
-  const int n = 256;
-  const int modulus = MODULUS_PARAM;
-  int i;
-  if ((m << 1) <= n)
-    i = vp9_recenter_nonneg(v, m) - 1;
-  else
-    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
-
-  i = split_index(i, n - 1, modulus);
-  return i;
+static void update_tx_count_stats(VP9_COMMON *cm) {
+  int i, j;
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZE_MAX_SB; j++) {
+      tx_count_32x32p_stats[i][j] += cm->fc.tx_count_32x32p[i][j];
+    }
+  }
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
+      tx_count_16x16p_stats[i][j] += cm->fc.tx_count_16x16p[i][j];
+    }
+  }
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
+      tx_count_8x8p_stats[i][j] += cm->fc.tx_count_8x8p[i][j];
+    }
+  }
 }
 
-static void write_prob_diff_update(vp9_writer *const bc,
-                                   vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
+static void update_switchable_interp_stats(VP9_COMMON *cm) {
+  int i, j;
+  for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; ++i)
+    for (j = 0; j < VP9_SWITCHABLE_FILTERS; ++j) {
+      switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
+    }
 }
 
-static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
-  int delp = remap_prob(newp, oldp);
-  return update_bits[delp] * 256;
+void write_tx_count_stats() {
+  int i, j;
+  FILE *fp = fopen("tx_count.bin", "wb");
+  fwrite(tx_count_32x32p_stats, sizeof(tx_count_32x32p_stats), 1, fp);
+  fwrite(tx_count_16x16p_stats, sizeof(tx_count_16x16p_stats), 1, fp);
+  fwrite(tx_count_8x8p_stats, sizeof(tx_count_8x8p_stats), 1, fp);
+  fclose(fp);
+
+  printf(
+      "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB] = {\n");
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    printf("  { ");
+    for (j = 0; j < TX_SIZE_MAX_SB; j++) {
+      printf("%"PRId64", ", tx_count_32x32p_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
+  printf(
+      "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB-1] = {\n");
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    printf("  { ");
+    for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
+      printf("%"PRId64", ", tx_count_16x16p_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
+  printf(
+      "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB-2] = {\n");
+  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+    printf("  { ");
+    for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
+      printf("%"PRId64", ", tx_count_8x8p_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
 }
 
-static void update_mode(
-  vp9_writer *const bc,
-  int n,
-  vp9_token tok               [/* n */],
-  vp9_tree tree,
-  vp9_prob Pnew               [/* n-1 */],
-  vp9_prob Pcur               [/* n-1 */],
-  unsigned int bct            [/* n-1 */] [2],
-  const unsigned int num_events[/* n */]
-) {
-  unsigned int new_b = 0, old_b = 0;
-  int i = 0;
+void write_switchable_interp_stats() {
+  int i, j;
+  FILE *fp = fopen("switchable_interp.bin", "wb");
+  fwrite(switchable_interp_stats, sizeof(switchable_interp_stats), 1, fp);
+  fclose(fp);
 
-  vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
-  n--;
+  printf(
+      "vp9_default_switchable_filter_count[VP9_SWITCHABLE_FILTERS+1]"
+      "[VP9_SWITCHABLE_FILTERS] = {\n");
+  for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; i++) {
+    printf("  { ");
+    for (j = 0; j < VP9_SWITCHABLE_FILTERS; j++) {
+      printf("%"PRId64", ", switchable_interp_stats[i][j]);
+    }
+    printf("},\n");
+  }
+  printf("};\n");
+}
+#endif
 
-  do {
-    new_b += cost_branch(bct[i], Pnew[i]);
-    old_b += cost_branch(bct[i], Pcur[i]);
-  } while (++i < n);
+static int update_bits[255];
 
-  if (new_b + (n << 8) < old_b) {
-    int i = 0;
+static INLINE void write_be32(uint8_t *p, int value) {
+  p[0] = value >> 24;
+  p[1] = value >> 16;
+  p[2] = value >> 8;
+  p[3] = value;
+}
 
-    vp9_write_bit(bc, 1);
 
-    do {
-      const vp9_prob p = Pnew[i];
 
-      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
-    } while (++i < n);
-  } else
-    vp9_write_bit(bc, 0);
+int recenter_nonneg(int v, int m) {
+  if (v > (m << 1))
+    return v;
+  else if (v >= m)
+    return ((v - m) << 1);
+  else
+    return ((m - v) << 1) - 1;
 }
 
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
-                                      vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  {
-    vp9_prob Pnew   [VP9_YMODES - 1];
-    unsigned int bct [VP9_YMODES - 1] [2];
-
-    update_mode(
-      bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-      Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
-    );
-    update_mode(bc, VP9_I32X32_MODES, vp9_sb_ymode_encodings,
-                vp9_sb_ymode_tree, Pnew, cm->fc.sb_ymode_prob, bct,
-                (unsigned int *)cpi->sb_ymode_count);
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
   }
+  return cat;
 }
 
-void vp9_update_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *const pc = &cpi->common;
-  int k;
-
-  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-    pc->mbskip_pred_probs[k] = get_binary_prob(cpi->skip_false_count[k],
-                                               cpi->skip_true_count[k]);
-  }
+void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
+                             int data, int max) {
+  vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
 }
 
-static void update_switchable_interp_probs(VP9_COMP *cpi,
-                                           vp9_writer* const bc) {
-  VP9_COMMON *const pc = &cpi->common;
-  unsigned int branch_ct[32][2];
-  int i, j;
-  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-    vp9_tree_probs_from_distribution(
-        vp9_switchable_interp_tree,
-        pc->fc.switchable_interp_prob[j], branch_ct,
-        cpi->switchable_interp_count[j], 0);
-    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
-      if (pc->fc.switchable_interp_prob[j][i] < 1)
-        pc->fc.switchable_interp_prob[j][i] = 1;
-      vp9_write_literal(bc, pc->fc.switchable_interp_prob[j][i], 8);
-    }
+void encode_uniform(vp9_writer *w, int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0)
+    return;
+  m = (1 << l) - n;
+  if (v < m) {
+    vp9_write_literal(w, v, l - 1);
+  } else {
+    vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp9_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-// This function updates the reference frame prediction stats
-static void update_refpred_stats(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  int i;
-  vp9_prob new_pred_probs[PREDICTION_PROBS];
-  int old_cost, new_cost;
-
-  // Set the prediction probability structures to defaults
-  if (cm->frame_type != KEY_FRAME) {
-    // From the prediction counts set the probabilities for each context
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],
-                                          cpi->ref_pred_count[i][1]);
-
-      // Decide whether or not to update the reference frame probs.
-      // Returned costs are in 1/256 bit units.
-      old_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(cm->ref_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(cm->ref_pred_probs[i]));
-
-      new_cost =
-        (cpi->ref_pred_count[i][0] * vp9_cost_zero(new_pred_probs[i])) +
-        (cpi->ref_pred_count[i][1] * vp9_cost_one(new_pred_probs[i]));
-
-      // Cost saving must be >= 8 bits (2048 in these units)
-      if ((old_cost - new_cost) >= 2048) {
-        cpi->ref_pred_probs_update[i] = 1;
-        cm->ref_pred_probs[i] = new_pred_probs[i];
-      } else
-        cpi->ref_pred_probs_update[i] = 0;
-    }
-  }
+int count_uniform(int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return 0;
+  m = (1 << l) - n;
+  if (v < m)
+    return l - 1;
+  else
+    return l;
 }
 
-// This function is called to update the mode probability context used to encode
-// inter modes. It assumes the branch counts table has already been populated
-// prior to the actual packing of the bitstream (in rd stage or dummy pack)
-//
-// The branch counts table is re-populated during the actual pack stage and in
-// the decoder to facilitate backwards update of the context.
-static void update_inter_mode_probs(VP9_COMMON *cm,
-                                    int mode_context[INTER_MODE_CONTEXTS][4]) {
-  int i, j;
-  unsigned int (*mv_ref_ct)[4][2];
-
-  vpx_memcpy(mode_context, cm->fc.vp9_mode_contexts,
-             sizeof(cm->fc.vp9_mode_contexts));
-
-  mv_ref_ct = cm->fc.mv_ref_ct;
-
-  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-    for (j = 0; j < 4; j++) {
-      int new_prob, old_cost, new_cost;
-
-      // Work out cost of coding branches with the old and optimal probability
-      old_cost = cost_branch256(mv_ref_ct[i][j], mode_context[i][j]);
-      new_prob = get_binary_prob(mv_ref_ct[i][j][0], mv_ref_ct[i][j][1]);
-      new_cost = cost_branch256(mv_ref_ct[i][j], new_prob);
-
-      // If cost saving is >= 14 bits then update the mode probability.
-      // This is the approximate net cost of updating one probability given
-      // that the no update case ismuch more common than the update case.
-      if (new_cost <= (old_cost - (14 << 8))) {
-        mode_context[i][j] = new_prob;
+void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      encode_uniform(w, word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      vp9_write_literal(w, t, 1);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        vp9_write_literal(w, word - mk, b);
+        break;
       }
     }
   }
 }
 
-#if CONFIG_NEW_MVREF
-static void update_mv_ref_probs(VP9_COMP *cpi,
-                                int mvref_probs[MAX_REF_FRAMES]
-                                               [MAX_MV_REF_CANDIDATES-1]) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  int rf;     // Reference frame
-  int ref_c;  // Motion reference candidate
-  int node;   // Probability node index
-
-  for (rf = 0; rf < MAX_REF_FRAMES; ++rf) {
-    int count = 0;
-
-    // Skip the dummy entry for intra ref frame.
-    if (rf == INTRA_FRAME) {
-      continue;
-    }
-
-    // Sum the counts for all candidates
-    for (ref_c = 0; ref_c < MAX_MV_REF_CANDIDATES; ++ref_c) {
-      count += cpi->mb_mv_ref_count[rf][ref_c];
-    }
-
-    // Calculate the tree node probabilities
-    for (node = 0; node < MAX_MV_REF_CANDIDATES-1; ++node) {
-      int new_prob, old_cost, new_cost;
-      unsigned int branch_cnts[2];
-
-      // How many hits on each branch at this node
-      branch_cnts[0] = cpi->mb_mv_ref_count[rf][node];
-      branch_cnts[1] = count - cpi->mb_mv_ref_count[rf][node];
-
-      // Work out cost of coding branches with the old and optimal probability
-      old_cost = cost_branch256(branch_cnts, xd->mb_mv_ref_probs[rf][node]);
-      new_prob = get_prob(branch_cnts[0], count);
-      new_cost = cost_branch256(branch_cnts, new_prob);
-
-      // Take current 0 branch cases out of residual count
-      count -= cpi->mb_mv_ref_count[rf][node];
-
-      if ((new_cost + VP9_MV_REF_UPDATE_COST) <= old_cost) {
-        mvref_probs[rf][node] = new_prob;
+int count_term_subexp(int word, int k, int num_syms) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      count += count_uniform(word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
       } else {
-        mvref_probs[rf][node] = xd->mb_mv_ref_probs[rf][node];
+        count += b;
+        break;
       }
     }
   }
+  return count;
 }
-#endif
 
-static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
+static void compute_update_table() {
+  int i;
+  for (i = 0; i < 254; i++)
+    update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
 }
 
-static void kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
+static int split_index(int i, int n, int modulus) {
+  int max1 = (n - 1 - modulus / 2) / modulus + 1;
+  if (i % modulus == modulus / 2) i = i / modulus;
+  else i = max1 + i - (i + modulus - modulus / 2) / modulus;
+  return i;
 }
 
-static void write_sb_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_sb_ymode_tree, p, vp9_sb_ymode_encodings + m);
-}
+static int remap_prob(int v, int m) {
+  const int n = 255;
+  const int modulus = MODULUS_PARAM;
+  int i;
+  v--;
+  m--;
+  if ((m << 1) <= n)
+    i = recenter_nonneg(v, m) - 1;
+  else
+    i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
 
-static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
+  i = split_index(i, n - 1, modulus);
+  return i;
 }
 
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
+static void write_prob_diff_update(vp9_writer *w,
+                                   vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
 }
 
-static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
+static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
+  int delp = remap_prob(newp, oldp);
+  return update_bits[delp] * 256;
 }
 
-
-static void write_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-#if CONFIG_NEWBINTRAMODES
-  assert(m < B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS || m == B_CONTEXT_PRED);
-  if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
-  write_token(bc, vp9_bmode_tree, p, vp9_bmode_encodings + m);
-}
-
-static void write_kf_bmode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m);
-}
-
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(
-    bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-
 static int prob_update_savings(const unsigned int *ct,
                                const vp9_prob oldp, const vp9_prob newp,
                                const vp9_prob upd) {
@@ -362,19 +308,9 @@
   const int old_b = cost_branch256(ct, oldp);
   const int new_b = cost_branch256(ct, newp);
   const int update_b = 2048 + vp9_cost_upd256;
-  return (old_b - new_b - update_b);
+  return old_b - new_b - update_b;
 }
 
-static int prob_diff_update_savings(const unsigned int *ct,
-                                    const vp9_prob oldp, const vp9_prob newp,
-                                    const vp9_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  const int new_b = cost_branch256(ct, newp);
-  const int update_b = (newp == oldp ? 0 :
-                        prob_diff_update_cost(newp, oldp) + vp9_cost_upd256);
-  return (old_b - new_b - update_b);
-}
-
 static int prob_diff_update_savings_search(const unsigned int *ct,
                                            const vp9_prob oldp, vp9_prob *bestp,
                                            const vp9_prob upd) {
@@ -399,7 +335,6 @@
   return bestsavings;
 }
 
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
 static int prob_diff_update_savings_search_model(const unsigned int *ct,
                                                  const vp9_prob *oldp,
                                                  vp9_prob *bestp,
@@ -407,23 +342,26 @@
                                                  int b, int r) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
-  vp9_prob bestnewp, newplist[ENTROPY_NODES];
-  for (i = UNCONSTRAINED_NODES - 1, old_b = 0; i < ENTROPY_NODES; ++i)
-    old_b += cost_branch256(ct + 2 * i, oldp[i]);
+  vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vp9_model_to_full_probs(oldp, oldplist);
+  vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+    old_b += cost_branch256(ct + 2 * i, oldplist[i]);
+  old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
 
   bestsavings = 0;
-  bestnewp = oldp[UNCONSTRAINED_NODES - 1];
+  bestnewp = oldp[PIVOT_NODE];
 
-  step = (*bestp > oldp[UNCONSTRAINED_NODES - 1] ? -1 : 1);
+  step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
   newp = *bestp;
-  // newp = *bestp - step * (abs(*bestp - oldp[UNCONSTRAINED_NODES - 1]) >> 1);
-  for (; newp != oldp[UNCONSTRAINED_NODES - 1]; newp += step) {
+  for (; newp != oldp[PIVOT_NODE]; newp += step) {
     if (newp < 1 || newp > 255) continue;
-    newplist[UNCONSTRAINED_NODES - 1] = newp;
-    vp9_get_model_distribution(newp, newplist, b, r);
-    for (i = UNCONSTRAINED_NODES - 1, new_b = 0; i < ENTROPY_NODES; ++i)
+    newplist[PIVOT_NODE] = newp;
+    vp9_model_to_full_probs(newplist, newplist);
+    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
       new_b += cost_branch256(ct + 2 * i, newplist[i]);
-    update_b = prob_diff_update_cost(newp, oldp[UNCONSTRAINED_NODES - 1]) +
+    new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
         vp9_cost_upd256;
     savings = old_b - new_b - update_b;
     if (savings > bestsavings) {
@@ -434,7 +372,6 @@
   *bestp = bestnewp;
   return bestsavings;
 }
-#endif
 
 static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd,
                                  unsigned int *ct) {
@@ -441,10 +378,11 @@
   vp9_prob newp;
   int savings;
   newp = get_binary_prob(ct[0], ct[1]);
+  assert(newp >= 1);
   savings = prob_update_savings(ct, *oldp, newp, upd);
   if (savings > 0) {
     vp9_write(bc, 1, upd);
-    vp9_write_literal(bc, newp, 8);
+    vp9_write_prob(bc, newp);
     *oldp = newp;
   } else {
     vp9_write(bc, 0, upd);
@@ -451,6 +389,108 @@
   }
 }
 
+static void vp9_cond_prob_diff_update(vp9_writer *bc, vp9_prob *oldp,
+                                      vp9_prob upd,
+                                      unsigned int *ct) {
+  vp9_prob newp;
+  int savings;
+  newp = get_binary_prob(ct[0], ct[1]);
+  assert(newp >= 1);
+  savings = prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+  if (savings > 0) {
+    vp9_write(bc, 1, upd);
+    write_prob_diff_update(bc, newp, *oldp);
+    *oldp = newp;
+  } else {
+    vp9_write(bc, 0, upd);
+  }
+}
+
+static void update_mode(
+  vp9_writer *w,
+  int n,
+  const struct vp9_token tok[/* n */],
+  vp9_tree tree,
+  vp9_prob Pnew[/* n-1 */],
+  vp9_prob Pcur[/* n-1 */],
+  unsigned int bct[/* n-1 */] [2],
+  const unsigned int num_events[/* n */]
+) {
+  int i = 0;
+
+  vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
+  n--;
+
+  for (i = 0; i < n; ++i) {
+    vp9_cond_prob_diff_update(w, &Pcur[i], VP9_MODE_UPDATE_PROB, bct[i]);
+  }
+}
+
+static void update_mbintra_mode_probs(VP9_COMP* const cpi,
+                                      vp9_writer* const bc) {
+  VP9_COMMON *const cm = &cpi->common;
+  int j;
+  vp9_prob pnew[VP9_INTRA_MODES - 1];
+  unsigned int bct[VP9_INTRA_MODES - 1][2];
+
+  for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+    update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_encodings,
+                vp9_intra_mode_tree, pnew,
+                cm->fc.y_mode_prob[j], bct,
+                (unsigned int *)cpi->y_mode_count[j]);
+}
+
+void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  int k;
+
+  for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
+    vp9_cond_prob_diff_update(bc, &pc->fc.mbskip_probs[k],
+                              VP9_MODE_UPDATE_PROB, pc->fc.mbskip_count[k]);
+  }
+}
+
+static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
+  write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
+}
+
+static void update_switchable_interp_probs(VP9_COMP *const cpi,
+                                           vp9_writer* const bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  unsigned int branch_ct[VP9_SWITCHABLE_FILTERS + 1]
+                        [VP9_SWITCHABLE_FILTERS - 1][2];
+  vp9_prob new_prob[VP9_SWITCHABLE_FILTERS + 1][VP9_SWITCHABLE_FILTERS - 1];
+  int i, j;
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    vp9_tree_probs_from_distribution(
+        vp9_switchable_interp_tree,
+        new_prob[j], branch_ct[j],
+        pc->fc.switchable_interp_count[j], 0);
+  }
+  for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
+      vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i],
+                                VP9_MODE_UPDATE_PROB, branch_ct[j][i]);
+    }
+  }
+#ifdef MODE_STATS
+  if (!cpi->dummy_packing)
+    update_switchable_interp_stats(pc);
+#endif
+}
+
+static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) {
+  int i, j;
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
+    for (j = 0; j < VP9_INTER_MODES - 1; j++) {
+      vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j],
+                                VP9_MODE_UPDATE_PROB,
+                                pc->fc.inter_mode_counts[i][j]);
+    }
+  }
+}
+
 static void pack_mb_tokens(vp9_writer* const bc,
                            TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop) {
@@ -457,50 +497,65 @@
   TOKENEXTRA *p = *tp;
 
   while (p < stop) {
-    const int t = p->Token;
-    vp9_token *const a = vp9_coef_encodings + t;
-    const vp9_extra_bit_struct *const b = vp9_extra_bits + t;
+    const int t = p->token;
+    const struct vp9_token *const a = vp9_coef_encodings + t;
+    const vp9_extra_bit *const b = vp9_extra_bits + t;
     int i = 0;
-    const unsigned char *pp = p->context_tree;
+    const vp9_prob *pp;
     int v = a->value;
-    int n = a->Len;
+    int n = a->len;
+    vp9_prob probs[ENTROPY_NODES];
 
-    if (t == EOSB_TOKEN)
-    {
+    if (t == EOSB_TOKEN) {
       ++p;
       break;
     }
+    if (t >= TWO_TOKEN) {
+      vp9_model_to_full_probs(p->context_tree, probs);
+      pp = probs;
+    } else {
+      pp = p->context_tree;
+    }
+    assert(pp != 0);
 
     /* skip one or two nodes */
+#if !CONFIG_BALANCED_COEFTREE
     if (p->skip_eob_node) {
       n -= p->skip_eob_node;
       i = 2 * p->skip_eob_node;
     }
+#endif
 
     do {
       const int bb = (v >> --n) & 1;
-      encode_bool(bc, bb, pp[i >> 1]);
+#if CONFIG_BALANCED_COEFTREE
+      if (i == 2 && p->skip_eob_node) {
+        i += 2;
+        assert(bb == 1);
+        continue;
+      }
+#endif
+      vp9_write(bc, bb, pp[i >> 1]);
       i = vp9_coef_tree[i + bb];
     } while (n);
 
-
     if (b->base_val) {
-      const int e = p->Extra, L = b->Len;
+      const int e = p->extra, l = b->len;
 
-      if (L) {
-        const unsigned char *pp = b->prob;
+      if (l) {
+        const unsigned char *pb = b->prob;
         int v = e >> 1;
-        int n = L;              /* number of bits in v, assumed nonzero */
+        int n = l;              /* number of bits in v, assumed nonzero */
         int i = 0;
 
         do {
           const int bb = (v >> --n) & 1;
-          encode_bool(bc, bb, pp[i >> 1]);
+          vp9_write(bc, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      encode_bool(bc, e & 1, 128);
+      vp9_write_bit(bc, e & 1);
     }
     ++p;
   }
@@ -508,225 +563,60 @@
   *tp = p;
 }
 
-static void write_partition_size(unsigned char *cx_data, int size) {
-  signed char csize;
-
-  csize = size & 0xff;
-  *cx_data = csize;
-  csize = (size >> 8) & 0xff;
-  *(cx_data + 1) = csize;
-  csize = (size >> 16) & 0xff;
-  *(cx_data + 2) = csize;
-
-}
-
-static void write_mv_ref
-(
-  vp9_writer *bc, MB_PREDICTION_MODE m, const vp9_prob *p
-) {
-#if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m <= SPLITMV);
-#endif
-  write_token(bc, vp9_mv_ref_tree, p,
-              vp9_mv_ref_encoding_array - NEARESTMV + m);
-}
-
 static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
                             const vp9_prob *p) {
 #if CONFIG_DEBUG
-  assert(NEARESTMV <= m  &&  m < SPLITMV);
+  assert(NEARESTMV <= m && m <= NEWMV);
 #endif
   write_token(bc, vp9_sb_mv_ref_tree, p,
               vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
 }
 
-static void write_sub_mv_ref
-(
-  vp9_writer *bc, B_PREDICTION_MODE m, const vp9_prob *p
-) {
-#if CONFIG_DEBUG
-  assert(LEFT4X4 <= m  &&  m <= NEW4X4);
-#endif
-  write_token(bc, vp9_sub_mv_ref_tree, p,
-              vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);
-}
-
-static void write_nmv(VP9_COMP *cpi, vp9_writer *bc,
-                      const MV *mv, const int_mv *ref,
-                      const nmv_context *nmvc, int usehp) {
-  MV e;
-  e.row = mv->row - ref->as_mv.row;
-  e.col = mv->col - ref->as_mv.col;
-
-  vp9_encode_nmv(bc, &e, &ref->as_mv, nmvc);
-  vp9_encode_nmv_fp(bc, &e, &ref->as_mv, nmvc, usehp);
-}
-
-#if CONFIG_NEW_MVREF
-static void vp9_write_mv_ref_id(vp9_writer *w,
-                                vp9_prob * ref_id_probs,
-                                int mv_ref_id) {
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      vp9_write(w, 0, ref_id_probs[0]);
-      break;
-    case 1:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 0, ref_id_probs[1]);
-      break;
-    case 2:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 0, ref_id_probs[2]);
-      break;
-    case 3:
-      vp9_write(w, 1, ref_id_probs[0]);
-      vp9_write(w, 1, ref_id_probs[1]);
-      vp9_write(w, 1, ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-}
-#endif
-
 // This function writes the current macro block's segnment id to the bitstream
 // It should only be called if a segment map update is indicated.
 static void write_mb_segid(vp9_writer *bc,
                            const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    switch (seg_id) {
-      case 0:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-      case 1:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[1]);
-        break;
-      case 2:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[2]);
-        break;
-      case 3:
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 1, xd->mb_segment_tree_probs[2]);
-        break;
-
-        // TRAP.. This should not happen
-      default:
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[0]);
-        vp9_write(bc, 0, xd->mb_segment_tree_probs[1]);
-        break;
-    }
-  }
+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
+    treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs,
+                mi->segment_id, 3);
 }
 
-static void write_mb_segid_except(VP9_COMMON *cm,
-                                  vp9_writer *bc,
-                                  const MB_MODE_INFO *mi,
-                                  const MACROBLOCKD *xd,
-                                  int mb_row, int mb_col) {
-  // Encode the MB segment id.
-  int seg_id = mi->segment_id;
-  int pred_seg_id = vp9_get_pred_mb_segid(cm, xd,
-                                          mb_row * cm->mb_cols + mb_col);
-  const vp9_prob *p = xd->mb_segment_tree_probs;
-  const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];
-
-  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
-    vp9_write(bc, seg_id >= 2, p1);
-    if (pred_seg_id >= 2 && seg_id < 2) {
-      vp9_write(bc, seg_id == 1, p[1]);
-    } else if (pred_seg_id < 2 && seg_id >= 2) {
-      vp9_write(bc, seg_id == 3, p[2]);
-    }
-  }
-}
-
 // This function encodes the reference frame
-static void encode_ref_frame(vp9_writer *const bc,
-                             VP9_COMMON *const cm,
-                             MACROBLOCKD *xd,
-                             int segment_id,
-                             MV_REFERENCE_FRAME rf) {
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
-
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME) +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
+static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
+  VP9_COMMON *const pc = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mi = &xd->mode_info_context->mbmi;
+  const int segment_id = mi->segment_id;
+  int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                             SEG_LVL_REF_FRAME);
   // If segment level coding of this signal is disabled...
   // or the segment allows multiple reference frame options
-  if (!seg_ref_active || (seg_ref_count > 1)) {
-    // Values used in prediction model coding
-    unsigned char prediction_flag;
-    vp9_prob pred_prob;
-    MV_REFERENCE_FRAME pred_rf;
+  if (!seg_ref_active) {
+    // does the feature use compound prediction or not
+    // (if not specified at the frame/segment level)
+    if (pc->comp_pred_mode == HYBRID_PREDICTION) {
+      vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_COMP_INTER_INTER));
+    } else {
+      assert((mi->ref_frame[1] <= INTRA_FRAME) ==
+                 (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+    }
 
-    // Get the context probability the prediction flag
-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-    // Get the predicted value.
-    pred_rf = vp9_get_pred_ref(cm, xd);
-
-    // Did the chosen reference frame match its predicted value.
-    prediction_flag =
-      (xd->mode_info_context->mbmi.ref_frame == pred_rf);
-
-    vp9_set_pred_flag(xd, PRED_REF, prediction_flag);
-    vp9_write(bc, prediction_flag, pred_prob);
-
-    // If not predicted correctly then code value explicitly
-    if (!prediction_flag) {
-      vp9_prob mod_refprobs[PREDICTION_PROBS];
-
-      vpx_memcpy(mod_refprobs,
-                 cm->mod_refprobs[pred_rf], sizeof(mod_refprobs));
-
-      // If segment coding enabled blank out options that cant occur by
-      // setting the branch probability to 0.
-      if (seg_ref_active) {
-        mod_refprobs[INTRA_FRAME] *=
-          vp9_check_segref(xd, segment_id, INTRA_FRAME);
-        mod_refprobs[LAST_FRAME] *=
-          vp9_check_segref(xd, segment_id, LAST_FRAME);
-        mod_refprobs[GOLDEN_FRAME] *=
-          (vp9_check_segref(xd, segment_id, GOLDEN_FRAME) *
-           vp9_check_segref(xd, segment_id, ALTREF_FRAME));
-      }
-
-      if (mod_refprobs[0]) {
-        vp9_write(bc, (rf != INTRA_FRAME), mod_refprobs[0]);
-      }
-
-      // Inter coded
-      if (rf != INTRA_FRAME) {
-        if (mod_refprobs[1]) {
-          vp9_write(bc, (rf != LAST_FRAME), mod_refprobs[1]);
-        }
-
-        if (rf != LAST_FRAME) {
-          if (mod_refprobs[2]) {
-            vp9_write(bc, (rf != GOLDEN_FRAME), mod_refprobs[2]);
-          }
-        }
-      }
+    if (mi->ref_frame[1] > INTRA_FRAME) {
+      vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_COMP_REF_P));
+    } else {
+      vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P1));
+      if (mi->ref_frame[0] != LAST_FRAME)
+        vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
+                  vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P2));
     }
+  } else {
+    assert(mi->ref_frame[1] <= INTRA_FRAME);
+    assert(vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) ==
+           mi->ref_frame[0]);
   }
 
   // if using the prediction mdoel we have nothing further to do because
@@ -733,51 +623,21 @@
   // the reference frame is fully coded by the segment
 }
 
-// Update the probabilities used to encode reference frame data
-static void update_ref_probs(VP9_COMP *const cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  const int rf_intra = rfct[INTRA_FRAME];
-  const int rf_inter = rfct[LAST_FRAME] +
-                       rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
-
-  cm->prob_intra_coded = get_binary_prob(rf_intra, rf_inter);
-  cm->prob_last_coded = get_prob(rfct[LAST_FRAME], rf_inter);
-  cm->prob_gf_coded = get_binary_prob(rfct[GOLDEN_FRAME], rfct[ALTREF_FRAME]);
-
-  // Compute a modified set of probabilities to use when prediction of the
-  // reference frame fails
-  vp9_compute_mod_refprobs(cm);
-}
-
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
-                                vp9_writer *bc,
-                                int mb_rows_left, int mb_cols_left) {
+                                vp9_writer *bc, int mi_row, int mi_col) {
   VP9_COMMON *const pc = &cpi->common;
   const nmv_context *nmvc = &pc->fc.nmvc;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int mis = pc->mode_info_stride;
   MB_MODE_INFO *const mi = &m->mbmi;
-  const MV_REFERENCE_FRAME rf = mi->ref_frame;
+  const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
   const MB_PREDICTION_MODE mode = mi->mode;
   const int segment_id = mi->segment_id;
-  const int mb_size = 1 << mi->sb_type;
   int skip_coeff;
 
-  int mb_row = pc->mb_rows - mb_rows_left;
-  int mb_col = pc->mb_cols - mb_cols_left;
   xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi);
   x->partition_info = x->pi + (m - pc->mi);
 
-  // Distance of Mb to the various image edges.
-  // These specified to 8th pel as they are always compared to MV
-  // values that are in 1/8th pel units
-
-  set_mb_row(pc, xd, mb_row, mb_size);
-  set_mb_col(pc, xd, mb_col, mb_size);
-
 #ifdef ENTROPY_STATS
   active_section = 9;
 #endif
@@ -793,7 +653,7 @@
 
       // If the mb segment id wasn't predicted code explicitly
       if (!prediction_flag)
-        write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col);
+        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
     } else {
       // Normal unpredicted coding
       write_mb_segid(bc, mi, &cpi->mb.e_mbd);
@@ -800,9 +660,7 @@
     }
   }
 
-  if (!pc->mb_no_coeff_skip) {
-    skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     skip_coeff = m->mbmi.mb_skip_coeff;
@@ -810,42 +668,50 @@
               vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
   }
 
-  // Encode the reference frame.
-  encode_ref_frame(bc, pc, xd, segment_id, rf);
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
+    vp9_write(bc, rf != INTRA_FRAME,
+              vp9_get_pred_prob(pc, xd, PRED_INTRA_INTER));
 
+  if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT &&
+      !(rf != INTRA_FRAME &&
+        (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+    TX_SIZE sz = mi->txfm_size;
+    const vp9_prob *tx_probs = vp9_get_pred_probs(pc, xd, PRED_TX_SIZE);
+    vp9_write(bc, sz != TX_4X4, tx_probs[0]);
+    if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, tx_probs[1]);
+      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, tx_probs[2]);
+    }
+  }
+
   if (rf == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 6;
 #endif
 
-    if (m->mbmi.sb_type)
-      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-    else
-      write_ymode(bc, mode, pc->fc.ymode_prob);
-
-    if (mode == B_PRED) {
-      int j = 0;
-      do {
-        write_bmode(bc, m->bmi[j].as_mode.first,
-                    pc->fc.bmode_prob);
-      } while (++j < 16);
-    }
-    if (mode == I8X8_PRED) {
-      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
+    if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+      const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+      const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+      const int bsl = MIN(bwl, bhl);
+      write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]);
     } else {
-      write_uv_mode(bc, mi->uv_mode,
-                    pc->fc.uv_mode_prob[mode]);
+      int idx, idy;
+      int bw = 1 << b_width_log2(mi->sb_type);
+      int bh = 1 << b_height_log2(mi->sb_type);
+      for (idy = 0; idy < 2; idy += bh)
+        for (idx = 0; idx < 2; idx += bw) {
+          MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode.first;
+          write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
+        }
     }
+    write_intra_mode(bc, mi->uv_mode,
+                     pc->fc.uv_mode_prob[mode]);
   } else {
-    vp9_prob mv_ref_p[VP9_MVREFS - 1];
+    vp9_prob mv_ref_p[VP9_INTER_MODES - 1];
 
+    encode_ref_frame(cpi, bc);
+
     vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
 
 #ifdef ENTROPY_STATS
@@ -854,156 +720,63 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      if (mi->sb_type) {
+      if (mi->sb_type >= BLOCK_SIZE_SB8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
-      } else {
-        write_mv_ref(bc, mode, mv_ref_p);
+        vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
       }
-      vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
     }
 
-    if (mode >= NEARESTMV && mode <= SPLITMV) {
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        write_token(bc, vp9_switchable_interp_tree,
-                    vp9_get_pred_probs(&cpi->common, xd,
-                                       PRED_SWITCHABLE_INTERP),
-                    vp9_switchable_interp_encodings +
-                    vp9_switchable_interp_map[mi->interp_filter]);
-      } else {
-        assert(mi->interp_filter == cpi->common.mcomp_filter_type);
-      }
+    if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      write_token(bc, vp9_switchable_interp_tree,
+                  vp9_get_pred_probs(&cpi->common, xd,
+                                     PRED_SWITCHABLE_INTERP),
+                  vp9_switchable_interp_encodings +
+                  vp9_switchable_interp_map[mi->interp_filter]);
+    } else {
+      assert(mi->interp_filter == cpi->common.mcomp_filter_type);
     }
 
-    // does the feature use compound prediction or not
-    // (if not specified at the frame/segment level)
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-      vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
-                vp9_get_pred_prob(pc, xd, PRED_COMP));
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra &&
-        mode >= NEARESTMV && mode < SPLITMV &&
-        mi->second_ref_frame <= INTRA_FRAME) {
-      vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
-                pc->fc.interintra_prob);
-      // if (!cpi->dummy_packing)
-      //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
-      //          pc->fc.interintra_prob);
-      if (mi->second_ref_frame == INTRA_FRAME) {
-        // if (!cpi->dummy_packing)
-        //   printf("** %d %d\n", mi->interintra_mode,
-        // mi->interintra_uv_mode);
-        write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
-#if SEPARATE_INTERINTRA_UV
-        write_uv_mode(bc, mi->interintra_uv_mode,
-                      pc->fc.uv_mode_prob[mi->interintra_mode]);
-#endif
-      }
-    }
-#endif
-
-#if CONFIG_NEW_MVREF
-    // if ((mode == NEWMV) || (mode == SPLITMV)) {
-    if (mode == NEWMV) {
-      // Encode the index of the choice.
-      vp9_write_mv_ref_id(bc,
-                          xd->mb_mv_ref_probs[rf], mi->best_index);
-
-      if (mi->second_ref_frame > 0) {
-        // Encode the index of the choice.
-        vp9_write_mv_ref_id(
-                            bc, xd->mb_mv_ref_probs[mi->second_ref_frame],
-                            mi->best_second_index);
-      }
-    }
-#endif
-
-    switch (mode) { /* new, split require MVs */
-      case NEWMV:
-#ifdef ENTROPY_STATS
-        active_section = 5;
-#endif
-        write_nmv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv,
-                  (const nmv_context*) nmvc,
-                  xd->allow_high_precision_mv);
-
-        if (mi->second_ref_frame > 0) {
-          write_nmv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv,
-                    (const nmv_context*) nmvc,
-                    xd->allow_high_precision_mv);
-        }
-        break;
-      case SPLITMV: {
-        int j = 0;
-
-#ifdef MODE_STATS
-        ++count_mb_seg[mi->partitioning];
-#endif
-
-        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-        cpi->mbsplit_count[mi->partitioning]++;
-
-        do {
-          B_PREDICTION_MODE blockmode;
-          int_mv blockmv;
-          const int *const  L = vp9_mbsplits[mi->partitioning];
-          int k = -1;  /* first block in subset j */
-          int mv_contz;
-          int_mv leftmv, abovemv;
-
+    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+      int j;
+      MB_PREDICTION_MODE blockmode;
+      int_mv blockmv;
+      int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl;
+      int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl;
+      int idx, idy;
+      for (idy = 0; idy < 2; idy += bh) {
+        for (idx = 0; idx < 2; idx += bw) {
+          j = idy * 2 + idx;
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_DEBUG
-          while (j != L[++k])
-            if (k >= 16)
-              assert(0);
-#else
-          while (j != L[++k]);
-#endif
-          leftmv.as_int = left_block_mv(xd, m, k);
-          abovemv.as_int = above_block_mv(m, k, mis);
-          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
-
-          write_sub_mv_ref(bc, blockmode,
-                           cpi->common.fc.sub_mv_ref_prob[mv_contz]);
-          cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-          if (blockmode == NEW4X4) {
+          write_sb_mv_ref(bc, blockmode, mv_ref_p);
+          vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]);
+          if (blockmode == NEWMV) {
 #ifdef ENTROPY_STATS
             active_section = 11;
 #endif
-            write_nmv(cpi, bc, &blockmv.as_mv, &mi->best_mv,
-                      (const nmv_context*) nmvc,
-                      xd->allow_high_precision_mv);
+            vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv,
+                          nmvc, xd->allow_high_precision_mv);
 
-            if (mi->second_ref_frame > 0) {
-              write_nmv(cpi, bc,
-                        &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                        &mi->best_second_mv,
-                        (const nmv_context*) nmvc,
-                        xd->allow_high_precision_mv);
-            }
+            if (mi->ref_frame[1] > INTRA_FRAME)
+              vp9_encode_mv(bc,
+                            &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                            &mi->best_second_mv.as_mv,
+                            nmvc, xd->allow_high_precision_mv);
           }
-        } while (++j < cpi->mb.partition_info->count);
-        break;
+        }
       }
-      default:
-        break;
-    }
-  }
+    } else if (mode == NEWMV) {
+#ifdef ENTROPY_STATS
+      active_section = 5;
+#endif
+      vp9_encode_mv(bc,
+                    &mi->mv[0].as_mv, &mi->best_mv.as_mv,
+                    nmvc, xd->allow_high_precision_mv);
 
-  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-       (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                               mi->partitioning == PARTITIONING_4X4))) &&
-      pc->txfm_mode == TX_MODE_SELECT &&
-      !((pc->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-    TX_SIZE sz = mi->txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
-      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-      if (mi->sb_type && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+      if (mi->ref_frame[1] > INTRA_FRAME)
+        vp9_encode_mv(bc,
+                      &mi->mv[1].as_mv, &mi->best_second_mv.as_mv,
+                      nmvc, xd->allow_high_precision_mv);
     }
   }
 }
@@ -1010,726 +783,206 @@
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
                               MODE_INFO *m,
-                              vp9_writer *bc,
-                              int mb_rows_left, int mb_cols_left) {
+                              vp9_writer *bc, int mi_row, int mi_col) {
   const VP9_COMMON *const c = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int mis = c->mode_info_stride;
   const int ym = m->mbmi.mode;
+  const int mis = c->mode_info_stride;
   const int segment_id = m->mbmi.segment_id;
   int skip_coeff;
 
-  if (xd->update_mb_segmentation_map) {
+  if (xd->update_mb_segmentation_map)
     write_mb_segid(bc, &m->mbmi, xd);
-  }
 
-  if (!c->mb_no_coeff_skip) {
-    skip_coeff = 0;
-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     skip_coeff = 1;
   } else {
     skip_coeff = m->mbmi.mb_skip_coeff;
-    vp9_write(bc, skip_coeff,
-              vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+    vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-  if (m->mbmi.sb_type) {
-    sb_kfwrite_ymode(bc, ym,
-                     c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
-  } else {
-    kfwrite_ymode(bc, ym,
-                  c->kf_ymode_prob[c->kf_ymode_probs_index]);
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) {
+    TX_SIZE sz = m->mbmi.txfm_size;
+    const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE);
+    vp9_write(bc, sz != TX_4X4, tx_probs[0]);
+    if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) {
+      vp9_write(bc, sz != TX_8X8, tx_probs[1]);
+      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, tx_probs[2]);
+    }
   }
 
-  if (ym == B_PRED) {
-    int i = 0;
-    do {
-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE L = (xd->left_available || (i & 3)) ?
-                                  left_block_mode(m, i) : B_DC_PRED;
-      const int bm = m->bmi[i].as_mode.first;
-
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+    const MB_PREDICTION_MODE L = xd->left_available ?
+                                 left_block_mode(m, 0) : DC_PRED;
+    write_intra_mode(bc, ym, c->kf_y_mode_prob[A][L]);
+  } else {
+    int idx, idy;
+    int bw = 1 << b_width_log2(m->mbmi.sb_type);
+    int bh = 1 << b_height_log2(m->mbmi.sb_type);
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int i = idy * 2 + idx;
+        const MB_PREDICTION_MODE A = above_block_mode(m, i, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                     left_block_mode(m, i) : DC_PRED;
+        const int bm = m->bmi[i].as_mode.first;
 #ifdef ENTROPY_STATS
-      ++intra_mode_stats [A] [L] [bm];
+        ++intra_mode_stats[A][L][bm];
 #endif
-
-      write_kf_bmode(bc, bm, c->kf_bmode_prob[A][L]);
-    } while (++i < 16);
-  }
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                    c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
-
-  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !((c->mb_no_coeff_skip && skip_coeff) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED) {
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-      if (m->mbmi.sb_type && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
+        write_intra_mode(bc, bm, c->kf_y_mode_prob[A][L]);
+      }
     }
   }
+
+  write_intra_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void write_nzc(VP9_COMMON *const cm,
-                      uint16_t nzc,
-                      int nzc_context,
-                      TX_SIZE tx_size,
-                      int ref,
-                      int type,
-                      vp9_writer* const bc) {
-  int c, e;
-  c = codenzc(nzc);
-  if (tx_size == TX_32X32) {
-    write_token(bc, vp9_nzc32x32_tree,
-                cm->fc.nzc_probs_32x32[nzc_context][ref][type],
-                vp9_nzc32x32_encodings + c);
-    // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_16X16) {
-    write_token(bc, vp9_nzc16x16_tree,
-                cm->fc.nzc_probs_16x16[nzc_context][ref][type],
-                vp9_nzc16x16_encodings + c);
-    // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_8X8) {
-    write_token(bc, vp9_nzc8x8_tree,
-                cm->fc.nzc_probs_8x8[nzc_context][ref][type],
-                vp9_nzc8x8_encodings + c);
-    // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;
-  } else if (tx_size == TX_4X4) {
-    write_token(bc, vp9_nzc4x4_tree,
-                cm->fc.nzc_probs_4x4[nzc_context][ref][type],
-                vp9_nzc4x4_encodings + c);
-    // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;
+static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                          int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+  xd->mode_info_context = m;
+  set_mi_row_col(&cpi->common, xd, mi_row,
+                 1 << mi_height_log2(m->mbmi.sb_type),
+                 mi_col, 1 << mi_width_log2(m->mbmi.sb_type));
+  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+    write_mb_modes_kf(cpi, m, bc, mi_row, mi_col);
+#ifdef ENTROPY_STATS
+    active_section = 8;
+#endif
   } else {
-    assert(0);
+    pack_inter_mode_mvs(cpi, m, bc, mi_row, mi_col);
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
   }
 
-  if ((e = vp9_extranzcbits[c])) {
-    int x = nzc - vp9_basenzcvalue[c];
-    while (e--) {
-      int b = (x >> e) & 1;
-      vp9_write(bc, b,
-                cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);
-      // cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;
-    }
-  }
+  assert(*tok < tok_end);
+  pack_mb_tokens(bc, tok, tok_end);
 }
 
-static void write_nzcs_sb64(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
+static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                           int mi_row, int mi_col,
+                           BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 4;  // mode_info step for subsize
+  int n;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
 
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 256; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 64) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc);
-      }
-      break;
+  bwl = b_width_log2(m->mbmi.sb_type);
+  bhl = b_height_log2(m->mbmi.sb_type);
 
-    case TX_16X16:
-      for (j = 0; j < 256; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
-      break;
+  // parse the partition type
+  if ((bwl == bsl) && (bhl == bsl))
+    partition = PARTITION_NONE;
+  else if ((bwl == bsl) && (bhl < bsl))
+    partition = PARTITION_HORZ;
+  else if ((bwl < bsl) && (bhl == bsl))
+    partition = PARTITION_VERT;
+  else if ((bwl < bsl) && (bhl < bsl))
+    partition = PARTITION_SPLIT;
+  else
+    assert(0);
 
-    case TX_8X8:
-      for (j = 0; j < 256; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 256; j < 384; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
 
-    case TX_4X4:
-      for (j = 0; j < 256; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 256; j < 384; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    int pl;
+    int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize);
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
+    pl = partition_plane_context(xd, bsize);
+    // encode the partition information
+    if (idx == 0)
+      write_token(bc, vp9_partition_tree,
+                  cm->fc.partition_prob[cm->frame_type][pl],
+                  vp9_partition_encodings + partition);
+    else if (idx > 0)
+      vp9_write(bc, partition == PARTITION_SPLIT,
+                cm->fc.partition_prob[cm->frame_type][pl][idx]);
   }
-}
 
-static void write_nzcs_sb32(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
+  subsize = get_subsize(bsize, partition);
+  *(get_sb_index(xd, subsize)) = 0;
 
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_32X32:
-      for (j = 0; j < 64; j += 64) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
+  switch (partition) {
+    case PARTITION_NONE:
+      write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
       break;
-
-    case TX_16X16:
-      for (j = 0; j < 64; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 16) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);
-      }
+    case PARTITION_HORZ:
+      write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+      *(get_sb_index(xd, subsize)) = 1;
+      if ((mi_row + bs) < cm->mi_rows)
+        write_modes_b(cpi, m + bs * mis, bc, tok, tok_end, mi_row + bs, mi_col);
       break;
-
-    case TX_8X8:
-      for (j = 0; j < 64; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      for (j = 64; j < 96; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
+    case PARTITION_VERT:
+      write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col);
+      *(get_sb_index(xd, subsize)) = 1;
+      if ((mi_col + bs) < cm->mi_cols)
+        write_modes_b(cpi, m + bs, bc, tok, tok_end, mi_row, mi_col + bs);
       break;
-
-    case TX_4X4:
-      for (j = 0; j < 64; ++j) {
-        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
+    case PARTITION_SPLIT:
+      for (n = 0; n < 4; n++) {
+        int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
+        write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
+                       mi_row + j * bs, mi_col + i * bs, subsize);
       }
-      for (j = 64; j < 96; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
       break;
-
     default:
-      break;
+      assert(0);
   }
-}
 
-static void write_nzcs_mb16(VP9_COMP *cpi,
-                            MACROBLOCKD *xd,
-                            int mb_row,
-                            int mb_col,
-                            vp9_writer* const bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MODE_INFO *m = xd->mode_info_context;
-  MB_MODE_INFO *const mi = &m->mbmi;
-  int j, nzc_context;
-  const int ref = m->mbmi.ref_frame != INTRA_FRAME;
-
-  assert(mb_col == get_mb_col(xd));
-  assert(mb_row == get_mb_row(xd));
-
-  if (mi->mb_skip_coeff)
-    return;
-
-  switch (mi->txfm_size) {
-    case TX_16X16:
-      for (j = 0; j < 16; j += 16) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);
-      }
-      for (j = 16; j < 24; j += 4) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-      }
-      break;
-
-    case TX_8X8:
-      for (j = 0; j < 16; j += 4) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);
-      }
-      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {
-        for (j = 16; j < 24; ++j) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-        }
-      } else {
-        for (j = 16; j < 24; j += 4) {
-          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);
-        }
-      }
-      break;
-
-    case TX_4X4:
-      for (j = 0; j < 16; ++j) {
-        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);
-      }
-      for (j = 16; j < 24; ++j) {
-        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);
-        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);
-      }
-      break;
-
-    default:
-      break;
+  // update partition context
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
   }
 }
 
-#ifdef NZC_STATS
-void init_nzcstats() {
-  vp9_zero(nzc_stats_4x4);
-  vp9_zero(nzc_stats_8x8);
-  vp9_zero(nzc_stats_16x16);
-  vp9_zero(nzc_stats_32x32);
-  vp9_zero(nzc_pcat_stats);
-}
-
-void update_nzcstats(VP9_COMMON *const cm) {
-  int c, r, b, t;
-
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          nzc_stats_4x4[c][r][b][t] += cm->fc.nzc_counts_4x4[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC8X8_TOKENS; ++t) {
-          nzc_stats_8x8[c][r][b][t] += cm->fc.nzc_counts_8x8[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC16X16_TOKENS; ++t) {
-          nzc_stats_16x16[c][r][b][t] += cm->fc.nzc_counts_16x16[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        for (t = 0; t < NZC32X32_TOKENS; ++t) {
-          nzc_stats_32x32[c][r][b][t] += cm->fc.nzc_counts_32x32[c][r][b][t];
-        }
-      }
-    }
-  }
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        nzc_pcat_stats[c][t][b][0] += cm->fc.nzc_pcat_counts[c][t][b][0];
-        nzc_pcat_stats[c][t][b][1] += cm->fc.nzc_pcat_counts[c][t][b][1];
-      }
-    }
-  }
-}
-
-void print_nzcstats() {
-  int c, r, b, t;
-  FILE *f;
-
-  printf(
-    "static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]\n"
-    "                                                [REF_TYPES]\n"
-    "                                                [BLOCK_TYPES]\n"
-    "                                                [NZC4X4_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC4X4_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_4x4[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]\n"
-    "                                                [REF_TYPES]\n"
-    "                                                [BLOCK_TYPES]\n"
-    "                                                [NZC8X8_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC8X8_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_8x8[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]\n"
-    "                                                  [REF_TYPES]\n"
-    "                                                  [BLOCK_TYPES]\n"
-    "                                                  [NZC16X16_TOKENS] = {"
-    "\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC16X16_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_16x16[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]\n"
-    "                                                  [REF_TYPES]\n"
-    "                                                  [BLOCK_TYPES]\n"
-    "                                                  [NZC32X32_TOKENS] = {"
-    "\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        printf("      {");
-        for (t = 0; t < NZC32X32_TOKENS; ++t) {
-          printf(" %-3d,", nzc_stats_32x32[c][r][b][t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_pcat_counts[MAX_NZC_CONTEXTS]\n"
-    "                                             [NZC_TOKENS_EXTRA]\n"
-    "                                             [NZC_BITS_EXTRA] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      printf("    {");
-      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
-        printf(" %d/%d,",
-               nzc_pcat_stats[c][t][b][0], nzc_pcat_stats[c][t][b][1]);
-      }
-      printf(" },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]\n"
-    "                                           [REF_TYPES]\n"
-    "                                           [BLOCK_TYPES]\n"
-    "                                           [NZC4X4_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC4X4_NODES];
-        unsigned int branch_ct[NZC4X4_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc4x4_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_4x4[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC4X4_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]\n"
-    "                                           [REF_TYPES]\n"
-    "                                           [BLOCK_TYPES]\n"
-    "                                           [NZC8X8_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC8X8_NODES];
-        unsigned int branch_ct[NZC8X8_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc8x8_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_8x8[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC8X8_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]\n"
-    "                                             [REF_TYPES]\n"
-    "                                             [BLOCK_TYPES]\n"
-    "                                             [NZC16X16_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC16X16_NODES];
-        unsigned int branch_ct[NZC16X16_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc16x16_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_16x16[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC16X16_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]\n"
-    "                                             [REF_TYPES]\n"
-    "                                             [BLOCK_TYPES]\n"
-    "                                             [NZC32X32_TOKENS] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (r = 0; r < REF_TYPES; ++r) {
-      printf("    {\n");
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        vp9_prob probs[NZC32X32_NODES];
-        unsigned int branch_ct[NZC32X32_NODES][2];
-        vp9_tree_probs_from_distribution(vp9_nzc32x32_tree,
-                                         probs, branch_ct,
-                                         nzc_stats_32x32[c][r][b], 0);
-        printf("      {");
-        for (t = 0; t < NZC32X32_NODES; ++t) {
-          printf(" %-3d,", probs[t]);
-        }
-        printf(" },\n");
-      }
-      printf("    },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  printf(
-    "static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]\n"
-    "                                            [NZC_TOKENS_EXTRA]\n"
-    "                                            [NZC_BITS_EXTRA] = {\n");
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    printf("  {\n");
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      printf("    {");
-      for (b = 0; b < NZC_BITS_EXTRA; ++b) {
-        vp9_prob prob = get_binary_prob(nzc_pcat_stats[c][t][b][0],
-                                        nzc_pcat_stats[c][t][b][1]);
-        printf(" %-3d,", prob);
-      }
-      printf(" },\n");
-    }
-    printf("  },\n");
-  }
-  printf("};\n");
-
-  f = fopen("nzcstats.bin", "wb");
-  fwrite(nzc_stats_4x4, sizeof(nzc_stats_4x4), 1, f);
-  fwrite(nzc_stats_8x8, sizeof(nzc_stats_8x8), 1, f);
-  fwrite(nzc_stats_16x16, sizeof(nzc_stats_16x16), 1, f);
-  fwrite(nzc_stats_32x32, sizeof(nzc_stats_32x32), 1, f);
-  fwrite(nzc_pcat_stats, sizeof(nzc_pcat_stats), 1, f);
-  fclose(f);
-}
-#endif
-
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
-                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                          int mb_row, int mb_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  xd->mode_info_context = m;
-  set_mb_row(&cpi->common, xd, mb_row, (1 << m->mbmi.sb_type));
-  set_mb_col(&cpi->common, xd, mb_col, (1 << m->mbmi.sb_type));
-  if (cm->frame_type == KEY_FRAME) {
-    write_mb_modes_kf(cpi, m, bc,
-                      cm->mb_rows - mb_row, cm->mb_cols - mb_col);
-#ifdef ENTROPY_STATS
-    active_section = 8;
-#endif
-  } else {
-    pack_inter_mode_mvs(cpi, m, bc,
-                        cm->mb_rows - mb_row, cm->mb_cols - mb_col);
-#ifdef ENTROPY_STATS
-    active_section = 1;
-#endif
-  }
-#if CONFIG_CODE_NONZEROCOUNT
-  if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64)
-    write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc);
-  else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32)
-    write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc);
-  else
-    write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc);
-#endif
-
-  assert(*tok < tok_end);
-  pack_mb_tokens(bc, tok, tok_end);
-}
-
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
                         TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
   MODE_INFO *m, *m_ptr = c->mi;
-  int i, mb_row, mb_col;
+  int mi_row, mi_col;
 
-  m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;
-  for (mb_row = c->cur_tile_mb_row_start;
-       mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {
-    m = m_ptr;
-    for (mb_col = c->cur_tile_mb_col_start;
-         mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) {
-      vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);
-      if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);
-      } else {
-        int j;
+  m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
+  vpx_memset(c->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+             mi_cols_aligned_to_sb(c));
 
-        for (j = 0; j < 4; j++) {
-          const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
-          MODE_INFO *sb_m = m + y_idx_sb * mis + x_idx_sb;
-
-          if (mb_col + x_idx_sb >= c->mb_cols ||
-              mb_row + y_idx_sb >= c->mb_rows)
-            continue;
-
-          vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);
-          if (sb_m->mbmi.sb_type) {
-            assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-            write_modes_b(cpi, sb_m, bc, tok, tok_end,
-                          mb_row + y_idx_sb, mb_col + x_idx_sb);
-          } else {
-            // Process the 4 MBs in the order:
-            // top-left, top-right, bottom-left, bottom-right
-            for (i = 0; i < 4; i++) {
-              const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
-              MODE_INFO *mb_m = m + x_idx + y_idx * mis;
-
-              if (mb_row + y_idx >= c->mb_rows ||
-                  mb_col + x_idx >= c->mb_cols) {
-                // MB lies outside frame, move on
-                continue;
-              }
-
-              assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-              write_modes_b(cpi, mb_m, bc, tok, tok_end,
-                            mb_row + y_idx, mb_col + x_idx);
-            }
-          }
-        }
-      }
-    }
+  for (mi_row = c->cur_tile_mi_row_start;
+       mi_row < c->cur_tile_mi_row_end;
+       mi_row += 8, m_ptr += 8 * mis) {
+    m = m_ptr;
+    vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
+    for (mi_col = c->cur_tile_mi_col_start;
+         mi_col < c->cur_tile_mi_col_end;
+         mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE)
+      write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
+                     BLOCK_SIZE_SB64X64);
   }
 }
 
-
 /* This function is used for debugging probability trees. */
 static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {
   /* print coef probability tree */
@@ -1759,23 +1012,16 @@
   fclose(f);
 }
 
-static void build_tree_distribution(vp9_coeff_probs *coef_probs,
-                                    vp9_coeff_count *coef_counts,
-                                    unsigned int (*eob_branch_ct)[REF_TYPES]
-                                                                 [COEF_BANDS]
-                                                          [PREV_COEF_CONTEXTS],
-#ifdef ENTROPY_STATS
-                                    VP9_COMP *cpi,
-                                    vp9_coeff_accum *context_counters,
-#endif
-                                    vp9_coeff_stats *coef_branch_ct,
-                                    int block_types) {
+static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) {
+  vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[txfm_size];
+  vp9_coeff_count *coef_counts = cpi->coef_counts[txfm_size];
+  unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+      cpi->common.fc.eob_branch_counts[txfm_size];
+  vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[txfm_size];
+  vp9_prob full_probs[ENTROPY_NODES];
   int i, j, k, l;
-#ifdef ENTROPY_STATS
-  int t = 0;
-#endif
 
-  for (i = 0; i < block_types; ++i) {
+  for (i = 0; i < BLOCK_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
@@ -1782,19 +1028,31 @@
           if (l >= 3 && k == 0)
             continue;
           vp9_tree_probs_from_distribution(vp9_coef_tree,
-                                           coef_probs[i][j][k][l],
+                                           full_probs,
                                            coef_branch_ct[i][j][k][l],
                                            coef_counts[i][j][k][l], 0);
+          vpx_memcpy(coef_probs[i][j][k][l], full_probs,
+                     sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+#if CONFIG_BALANCED_COEFTREE
+          coef_branch_ct[i][j][k][l][1][1] = eob_branch_ct[i][j][k][l] -
+                                             coef_branch_ct[i][j][k][l][1][0];
+          coef_probs[i][j][k][l][1] =
+              get_binary_prob(coef_branch_ct[i][j][k][l][1][0],
+                              coef_branch_ct[i][j][k][l][1][1]);
+#else
           coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
                                              coef_branch_ct[i][j][k][l][0][0];
           coef_probs[i][j][k][l][0] =
               get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
                               coef_branch_ct[i][j][k][l][0][1]);
+#endif
 #ifdef ENTROPY_STATS
           if (!cpi->dummy_packing) {
+            int t;
             for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters[i][j][k][l][t] += coef_counts[i][j][k][l][t];
-            context_counters[i][j][k][l][MAX_ENTROPY_TOKENS] +=
+              context_counters[txfm_size][i][j][k][l][t] +=
+                  coef_counts[i][j][k][l][t];
+            context_counters[txfm_size][i][j][k][l][MAX_ENTROPY_TOKENS] +=
                 eob_branch_ct[i][j][k][l];
           }
 #endif
@@ -1805,301 +1063,45 @@
 }
 
 static void build_coeff_contexts(VP9_COMP *cpi) {
-  build_tree_distribution(cpi->frame_coef_probs_4x4,
-                          cpi->coef_counts_4x4,
-                          cpi->common.fc.eob_branch_counts[TX_4X4],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_4x4,
-#endif
-                          cpi->frame_branch_ct_4x4, BLOCK_TYPES);
-  build_tree_distribution(cpi->frame_coef_probs_8x8,
-                          cpi->coef_counts_8x8,
-                          cpi->common.fc.eob_branch_counts[TX_8X8],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_8x8,
-#endif
-                          cpi->frame_branch_ct_8x8, BLOCK_TYPES);
-  build_tree_distribution(cpi->frame_coef_probs_16x16,
-                          cpi->coef_counts_16x16,
-                          cpi->common.fc.eob_branch_counts[TX_16X16],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_16x16,
-#endif
-                          cpi->frame_branch_ct_16x16, BLOCK_TYPES);
-  build_tree_distribution(cpi->frame_coef_probs_32x32,
-                          cpi->coef_counts_32x32,
-                          cpi->common.fc.eob_branch_counts[TX_32X32],
-#ifdef ENTROPY_STATS
-                          cpi, context_counters_32x32,
-#endif
-                          cpi->frame_branch_ct_32x32, BLOCK_TYPES);
+  TX_SIZE t;
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    build_tree_distribution(cpi, t);
 }
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void update_nzc_probs_common(VP9_COMP* cpi,
-                                    vp9_writer* const bc,
-                                    int block_size) {
-  VP9_COMMON *cm = &cpi->common;
-  int c, r, b, t;
-  int update[2] = {0, 0};
-  int savings = 0;
-  int tokens, nodes;
-  const vp9_tree_index *nzc_tree;
-  vp9_prob *new_nzc_probs;
-  vp9_prob *old_nzc_probs;
-  unsigned int *nzc_counts;
-  unsigned int (*nzc_branch_ct)[2];
-  vp9_prob upd;
-
-  if (block_size == 32) {
-    tokens = NZC32X32_TOKENS;
-    nzc_tree = vp9_nzc32x32_tree;
-    old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0];
-    upd = NZC_UPDATE_PROB_32X32;
-  } else if (block_size == 16) {
-    tokens = NZC16X16_TOKENS;
-    nzc_tree = vp9_nzc16x16_tree;
-    old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0];
-    upd = NZC_UPDATE_PROB_16X16;
-  } else if (block_size == 8) {
-    tokens = NZC8X8_TOKENS;
-    nzc_tree = vp9_nzc8x8_tree;
-    old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0];
-    upd = NZC_UPDATE_PROB_8X8;
-  } else {
-    nzc_tree = vp9_nzc4x4_tree;
-    tokens = NZC4X4_TOKENS;
-    old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];
-    new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0];
-    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];
-    nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0];
-    upd = NZC_UPDATE_PROB_4X4;
-  }
-  nodes = tokens - 1;
-  // Get the new probabilities and the branch counts
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        int offset_tokens = offset * tokens;
-        vp9_tree_probs_from_distribution(nzc_tree,
-                                         new_nzc_probs + offset_nodes,
-                                         nzc_branch_ct + offset_nodes,
-                                         nzc_counts + offset_tokens, 0);
-      }
-    }
-  }
-
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-        int offset_nodes = offset * nodes;
-        for (t = 0; t < nodes; ++t) {
-          vp9_prob newp = new_nzc_probs[offset_nodes + t];
-          vp9_prob oldp = old_nzc_probs[offset_nodes + t];
-          int s, u = 0;
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
-                                                oldp, &newp, upd);
-            if (s > 0 && newp != oldp)
-              u = 1;
-            if (u)
-              savings += s - (int)(vp9_cost_zero(upd));
-            else
-              savings -= (int)(vp9_cost_zero(upd));
-#else
-          s = prob_update_savings(nzc_branch_ct[offset_nodes],
-                                  oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-          if (u)
-            savings += s;
-#endif
-          update[u]++;
-        }
-      }
-    }
-  }
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-      for (r = 0; r < REF_TYPES; ++r) {
-        for (b = 0; b < BLOCK_TYPES; ++b) {
-          int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;
-          int offset_nodes = offset * nodes;
-          for (t = 0; t < nodes; ++t) {
-            vp9_prob newp = new_nzc_probs[offset_nodes + t];
-            vp9_prob *oldp = &old_nzc_probs[offset_nodes + t];
-            int s, u = 0;
-#if defined(SEARCH_NEWP)
-            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],
-                                                *oldp, &newp, upd);
-            if (s > 0 && newp != *oldp)
-              u = 1;
-#else
-            s = prob_update_savings(nzc_branch_ct[offset_nodes],
-                                    *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
-            vp9_write(bc, u, upd);
-            if (u) {
-              /* send/use new probability */
-              write_prob_diff_update(bc, newp, *oldp);
-              *oldp = newp;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-static void update_nzc_pcat_probs(VP9_COMP *cpi, vp9_writer* const bc) {
-  VP9_COMMON *cm = &cpi->common;
-  int c, t, b;
-  int update[2] = {0, 0};
-  int savings = 0;
-  vp9_prob upd = NZC_UPDATE_PROB_PCAT;
-  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-      for (b = 0; b < bits; ++b) {
-        vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                        cm->fc.nzc_pcat_counts[c][t][b][1]);
-        vp9_prob oldp = cm->fc.nzc_pcat_probs[c][t][b];
-        int s, u = 0;
-#if defined(SEARCH_NEWP)
-        s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
-                                            oldp, &newp, upd);
-        if (s > 0 && newp != oldp)
-          u = 1;
-        if (u)
-          savings += s - (int)(vp9_cost_zero(upd));
-        else
-          savings -= (int)(vp9_cost_zero(upd));
-#else
-        s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
-                                oldp, newp, upd);
-        if (s > 0)
-          u = 1;
-        if (u)
-          savings += s;
-#endif
-        update[u]++;
-      }
-    }
-  }
-  if (update[1] == 0 || savings < 0) {
-    vp9_write_bit(bc, 0);
-  } else {
-    vp9_write_bit(bc, 1);
-    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {
-      for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {
-        int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];
-        for (b = 0; b < bits; ++b) {
-          vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],
-                                          cm->fc.nzc_pcat_counts[c][t][b][1]);
-          vp9_prob *oldp = &cm->fc.nzc_pcat_probs[c][t][b];
-          int s, u = 0;
-#if defined(SEARCH_NEWP)
-          s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],
-                                              *oldp, &newp, upd);
-          if (s > 0 && newp != *oldp)
-            u = 1;
-#else
-          s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],
-                                  *oldp, newp, upd);
-          if (s > 0)
-            u = 1;
-#endif
-          vp9_write(bc, u, upd);
-          if (u) {
-            /* send/use new probability */
-            write_prob_diff_update(bc, newp, *oldp);
-            *oldp = newp;
-          }
-        }
-      }
-    }
-  }
-}
-
-static void update_nzc_probs(VP9_COMP* cpi,
-                             vp9_writer* const bc) {
-  update_nzc_probs_common(cpi, bc, 4);
-  if (cpi->common.txfm_mode != ONLY_4X4)
-    update_nzc_probs_common(cpi, bc, 8);
-  if (cpi->common.txfm_mode > ALLOW_8X8)
-    update_nzc_probs_common(cpi, bc, 16);
-  if (cpi->common.txfm_mode > ALLOW_16X16)
-    update_nzc_probs_common(cpi, bc, 32);
-#ifdef NZC_PCAT_UPDATE
-  update_nzc_pcat_probs(cpi, bc);
-#endif
-#ifdef NZC_STATS
-  if (!cpi->dummy_packing)
-    update_nzcstats(&cpi->common);
-#endif
-}
-#endif  // CONFIG_CODE_NONZEROCOUNT
-
-static void update_coef_probs_common(vp9_writer* const bc,
-#ifdef ENTROPY_STATS
-                                     VP9_COMP *cpi,
-                                     vp9_coeff_stats *tree_update_hist,
-#endif
-                                     vp9_coeff_probs *new_frame_coef_probs,
-                                     vp9_coeff_probs *old_frame_coef_probs,
-                                     vp9_coeff_stats *frame_branch_ct,
-                                     int block_types) {
+static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
+                                     TX_SIZE tx_size) {
+  vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size];
+  vp9_coeff_probs_model *old_frame_coef_probs =
+      cpi->common.fc.coef_probs[tx_size];
+  vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
   int i, j, k, l, t;
   int update[2] = {0, 0};
   int savings;
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-  const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES;
-#else
-  const int entropy_nodes_update = ENTROPY_NODES;
-#endif
-  // vp9_prob bestupd = find_coef_update_prob(cpi);
 
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+
+  const int tstart = 0;
   /* dry run to see if there is any udpate at all needed */
   savings = 0;
-  for (i = 0; i < block_types; ++i) {
+  for (i = 0; i < BLOCK_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
         // int prev_coef_savings[ENTROPY_NODES] = {0};
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
-          for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) {
+          for (t = tstart; t < entropy_nodes_update; ++t) {
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
             const vp9_prob upd = vp9_coef_update_prob[t];
-            int s;  // = prev_coef_savings[t];
+            int s;
             int u = 0;
 
             if (l >= 3 && k == 0)
               continue;
-#if defined(SEARCH_NEWP)
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-            if (t == UNCONSTRAINED_NODES - 1)
+            if (t == PIVOT_NODE)
               s = prob_diff_update_savings_search_model(
                   frame_branch_ct[i][j][k][l][0],
                   old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
             else
-#endif
               s = prob_diff_update_savings_search(
                   frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
             if (s > 0 && newp != oldp)
@@ -2108,15 +1110,6 @@
               savings += s - (int)(vp9_cost_zero(upd));
             else
               savings -= (int)(vp9_cost_zero(upd));
-#else
-            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
-                                    oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-            if (u)
-              savings += s;
-#endif
-
             update[u]++;
           }
         }
@@ -2131,54 +1124,39 @@
     return;
   }
   vp9_write_bit(bc, 1);
-  for (i = 0; i < block_types; ++i) {
+  for (i = 0; i < BLOCK_TYPES; ++i) {
     for (j = 0; j < REF_TYPES; ++j) {
       for (k = 0; k < COEF_BANDS; ++k) {
         // int prev_coef_savings[ENTROPY_NODES] = {0};
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           // calc probs and branch cts for this frame only
-          for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) {
+          for (t = tstart; t < entropy_nodes_update; ++t) {
             vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
             vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
             const vp9_prob upd = vp9_coef_update_prob[t];
-            int s;  // = prev_coef_savings[t];
+            int s;
             int u = 0;
             if (l >= 3 && k == 0)
               continue;
-
-#if defined(SEARCH_NEWP)
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-            if (t == UNCONSTRAINED_NODES - 1)
+            if (t == PIVOT_NODE)
               s = prob_diff_update_savings_search_model(
                   frame_branch_ct[i][j][k][l][0],
                   old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
             else
-#endif
               s = prob_diff_update_savings_search(
                   frame_branch_ct[i][j][k][l][t],
                   *oldp, &newp, upd);
             if (s > 0 && newp != *oldp)
               u = 1;
-#else
-            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],
-                                    *oldp, newp, upd);
-            if (s > 0)
-              u = 1;
-#endif
             vp9_write(bc, u, upd);
 #ifdef ENTROPY_STATS
             if (!cpi->dummy_packing)
-              ++tree_update_hist[i][j][k][l][t][u];
+              ++tree_update_hist[tx_size][i][j][k][l][t][u];
 #endif
             if (u) {
               /* send/use new probability */
               write_prob_diff_update(bc, newp, *oldp);
               *oldp = newp;
-#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE
-              if (t == UNCONSTRAINED_NODES - 1)
-                vp9_get_model_distribution(
-                    newp, old_frame_coef_probs[i][j][k][l], i, j);
-#endif
             }
           }
         }
@@ -2188,738 +1166,565 @@
 }
 
 static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+  const TXFM_MODE txfm_mode = cpi->common.txfm_mode;
+
   vp9_clear_system_state();
 
   // Build the cofficient contexts based on counts collected in encode loop
   build_coeff_contexts(cpi);
 
-  update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                           cpi,
-                           tree_update_hist_4x4,
-#endif
-                           cpi->frame_coef_probs_4x4,
-                           cpi->common.fc.coef_probs_4x4,
-                           cpi->frame_branch_ct_4x4,
-                           BLOCK_TYPES);
+  update_coef_probs_common(bc, cpi, TX_4X4);
 
-  /* do not do this if not even allowed */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             tree_update_hist_8x8,
-#endif
-                             cpi->frame_coef_probs_8x8,
-                             cpi->common.fc.coef_probs_8x8,
-                             cpi->frame_branch_ct_8x8,
-                             BLOCK_TYPES);
-  }
+  // do not do this if not even allowed
+  if (txfm_mode > ONLY_4X4)
+    update_coef_probs_common(bc, cpi, TX_8X8);
 
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             tree_update_hist_16x16,
-#endif
-                             cpi->frame_coef_probs_16x16,
-                             cpi->common.fc.coef_probs_16x16,
-                             cpi->frame_branch_ct_16x16,
-                             BLOCK_TYPES);
-  }
+  if (txfm_mode > ALLOW_8X8)
+    update_coef_probs_common(bc, cpi, TX_16X16);
 
-  if (cpi->common.txfm_mode > ALLOW_16X16) {
-    update_coef_probs_common(bc,
-#ifdef ENTROPY_STATS
-                             cpi,
-                             tree_update_hist_32x32,
-#endif
-                             cpi->frame_coef_probs_32x32,
-                             cpi->common.fc.coef_probs_32x32,
-                             cpi->frame_branch_ct_32x32,
-                             BLOCK_TYPES);
-  }
+  if (txfm_mode > ALLOW_16X16)
+    update_coef_probs_common(bc, cpi, TX_32X32);
 }
 
-#ifdef PACKET_TESTING
-FILE *vpxlogc = 0;
-#endif
+static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd,
+                              struct vp9_write_bit_buffer *wb) {
+  int i;
 
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
+  // Encode the loop filter level and type
+  vp9_wb_write_literal(wb, pc->filter_level, 6);
+  vp9_wb_write_literal(wb, pc->sharpness_level, 3);
 
-    if (delta_q < 0)
-      vp9_write_bit(bc, 1);
-    else
-      vp9_write_bit(bc, 0);
-  } else
-    vp9_write_bit(bc, 0);
-}
+  // Write out loop filter deltas applied at the MB level based on mode or
+  // ref frame (if they are enabled).
+  vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_enabled);
 
-static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
+  if (xd->mode_ref_lf_delta_enabled) {
+    // Do the deltas need to be updated
+    vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_update);
+    if (xd->mode_ref_lf_delta_update) {
+      // Send update
+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+        const int delta = xd->ref_lf_deltas[i];
 
-  int mode_cost[MB_MODE_COUNT];
-  int cost;
-  int bestcost = INT_MAX;
-  int bestindex = 0;
-  int i, j;
+        // Frame level data
+        if (delta != xd->last_ref_lf_deltas[i]) {
+          xd->last_ref_lf_deltas[i] = delta;
+          vp9_wb_write_bit(wb, 1);
 
-  for (i = 0; i < 8; i++) {
-    vp9_cost_tokens(mode_cost, cpi->common.kf_ymode_prob[i], vp9_kf_ymode_tree);
-    cost = 0;
-    for (j = 0; j < VP9_YMODES; j++) {
-      cost += mode_cost[j] * cpi->ymode_count[j];
+          assert(delta != 0);
+          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vp9_wb_write_bit(wb, delta < 0);
+        } else {
+          vp9_wb_write_bit(wb, 0);
+        }
+      }
+
+      // Send update
+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+        const int delta = xd->mode_lf_deltas[i];
+        if (delta != xd->last_mode_lf_deltas[i]) {
+          xd->last_mode_lf_deltas[i] = delta;
+          vp9_wb_write_bit(wb, 1);
+
+          assert(delta != 0);
+          vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+          vp9_wb_write_bit(wb, delta < 0);
+        } else {
+          vp9_wb_write_bit(wb, 0);
+        }
+      }
     }
-    vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
-                    vp9_sb_ymode_tree);
-    for (j = 0; j < VP9_I32X32_MODES; j++) {
-      cost += mode_cost[j] * cpi->sb_ymode_count[j];
-    }
-    if (cost < bestcost) {
-      bestindex = i;
-      bestcost = cost;
-    }
   }
-  cpi->common.kf_ymode_probs_index = bestindex;
-
 }
-static void segment_reference_frames(VP9_COMP *cpi) {
-  VP9_COMMON *oci = &cpi->common;
-  MODE_INFO *mi = oci->mi;
-  int ref[MAX_MB_SEGMENTS] = {0};
-  int i, j;
-  int mb_index = 0;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  for (i = 0; i < oci->mb_rows; i++) {
-    for (j = 0; j < oci->mb_cols; j++, mb_index++) {
-      ref[mi[mb_index].mbmi.segment_id] |= (1 << mi[mb_index].mbmi.ref_frame);
-    }
-    mb_index++;
+static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) {
+  if (delta_q != 0) {
+    vp9_wb_write_bit(wb, 1);
+    vp9_wb_write_literal(wb, abs(delta_q), 4);
+    vp9_wb_write_bit(wb, delta_q < 0);
+  } else {
+    vp9_wb_write_bit(wb, 0);
   }
-  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-    vp9_enable_segfeature(xd, i, SEG_LVL_REF_FRAME);
-    vp9_set_segdata(xd, i, SEG_LVL_REF_FRAME, ref[i]);
-  }
 }
 
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
-                        unsigned long *size) {
+static void encode_quantization(VP9_COMMON *cm,
+                                struct vp9_write_bit_buffer *wb) {
+  vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+  write_delta_q(wb, cm->y_dc_delta_q);
+  write_delta_q(wb, cm->uv_dc_delta_q);
+  write_delta_q(wb, cm->uv_ac_delta_q);
+}
+
+
+static void encode_segmentation(VP9_COMP *cpi,
+                               struct vp9_write_bit_buffer *wb) {
   int i, j;
-  VP9_HEADER oh;
-  VP9_COMMON *const pc = &cpi->common;
-  vp9_writer header_bc, residual_bc;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  int extra_bytes_packed = 0;
 
-  unsigned char *cx_data = dest;
+  vp9_wb_write_bit(wb, xd->segmentation_enabled);
+  if (!xd->segmentation_enabled)
+    return;
 
-  oh.show_frame = (int) pc->show_frame;
-  oh.type = (int)pc->frame_type;
-  oh.version = pc->version;
-  oh.first_partition_length_in_bytes = 0;
-
-  cx_data += 3;
-
-#if defined(SECTIONBITS_OUTPUT)
-  Sectionbits[active_section = 1] += sizeof(VP9_HEADER) * 8 * 256;
-#endif
-
-  compute_update_table();
-
-  /* vp9_kf_default_bmode_probs() is called in vp9_setup_key_frame() once
-   * for each K frame before encode frame. pc->kf_bmode_prob doesn't get
-   * changed anywhere else. No need to call it again here. --yw
-   * vp9_kf_default_bmode_probs( pc->kf_bmode_prob);
-   */
-
-  /* every keyframe send startcode, width, height, scale factor, clamp
-   * and color type.
-   */
-  if (oh.type == KEY_FRAME) {
-    // Start / synch code
-    cx_data[0] = 0x9D;
-    cx_data[1] = 0x01;
-    cx_data[2] = 0x2a;
-    extra_bytes_packed = 3;
-    cx_data += extra_bytes_packed;
-  }
-  {
-    int v;
-
-    if (pc->width != pc->display_width || pc->height != pc->display_height) {
-      v = pc->display_width;
-      cx_data[0] = v;
-      cx_data[1] = v >> 8;
-
-      v = pc->display_height;
-      cx_data[2] = v;
-      cx_data[3] = v >> 8;
-      cx_data += 4;
-      extra_bytes_packed += 4;
+  // Segmentation map
+  vp9_wb_write_bit(wb, xd->update_mb_segmentation_map);
+  if (xd->update_mb_segmentation_map) {
+    // Select the coding strategy (temporal or spatial)
+    vp9_choose_segmap_coding_method(cpi);
+    // Write out probabilities used to decode unpredicted  macro-block segments
+    for (i = 0; i < MB_SEG_TREE_PROBS; i++) {
+      const int prob = xd->mb_segment_tree_probs[i];
+      const int update = prob != MAX_PROB;
+      vp9_wb_write_bit(wb, update);
+      if (update)
+        vp9_wb_write_literal(wb, prob, 8);
     }
 
-    v = pc->width;
-    cx_data[0] = v;
-    cx_data[1] = v >> 8;
-
-    v = pc->height;
-    cx_data[2] = v;
-    cx_data[3] = v >> 8;
-
-    extra_bytes_packed += 4;
-    cx_data += 4;
+    // Write out the chosen coding method.
+    vp9_wb_write_bit(wb, cm->temporal_update);
+    if (cm->temporal_update) {
+      for (i = 0; i < PREDICTION_PROBS; i++) {
+        const int prob = cm->segment_pred_probs[i];
+        const int update = prob != MAX_PROB;
+        vp9_wb_write_bit(wb, update);
+        if (update)
+          vp9_wb_write_literal(wb, prob, 8);
+      }
+    }
   }
 
-  vp9_start_encode(&header_bc, cx_data);
+  // Segmentation data
+  vp9_wb_write_bit(wb, xd->update_mb_segmentation_data);
+  if (xd->update_mb_segmentation_data) {
+    vp9_wb_write_bit(wb, xd->mb_segment_abs_delta);
 
-  // TODO(jkoleszar): remove these two unused bits?
-  vp9_write_bit(&header_bc, pc->clr_type);
-  vp9_write_bit(&header_bc, pc->clamp_type);
+    for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int active = vp9_segfeature_active(xd, i, j);
+        vp9_wb_write_bit(wb, active);
+        if (active) {
+          const int data = vp9_get_segdata(xd, i, j);
+          const int data_max = vp9_seg_feature_data_max(j);
 
-  // error resilient mode
-  vp9_write_bit(&header_bc, pc->error_resilient_mode);
-
-  // Signal whether or not Segmentation is enabled
-  vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);
-
-  // Indicate which features are enabled
-  if (xd->segmentation_enabled) {
-    // Indicate whether or not the segmentation map is being updated.
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_map) ? 1 : 0);
-
-    // If it is, then indicate the method that will be used.
-    if (xd->update_mb_segmentation_map) {
-      // Select the coding strategy (temporal or spatial)
-      vp9_choose_segmap_coding_method(cpi);
-      // Send the tree probabilities used to decode unpredicted
-      // macro-block segments
-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-        int data = xd->mb_segment_tree_probs[i];
-
-        if (data != 255) {
-          vp9_write_bit(&header_bc, 1);
-          vp9_write_literal(&header_bc, data, 8);
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-
-      // Write out the chosen coding method.
-      vp9_write_bit(&header_bc, (pc->temporal_update) ? 1 : 0);
-      if (pc->temporal_update) {
-        for (i = 0; i < PREDICTION_PROBS; i++) {
-          int data = pc->segment_pred_probs[i];
-
-          if (data != 255) {
-            vp9_write_bit(&header_bc, 1);
-            vp9_write_literal(&header_bc, data, 8);
+          if (vp9_is_segfeature_signed(j)) {
+            vp9_encode_unsigned_max(wb, abs(data), data_max);
+            vp9_wb_write_bit(wb, data < 0);
           } else {
-            vp9_write_bit(&header_bc, 0);
+            vp9_encode_unsigned_max(wb, data, data_max);
           }
         }
       }
     }
+  }
+}
 
-    vp9_write_bit(&header_bc, (xd->update_mb_segmentation_data) ? 1 : 0);
 
-    // segment_reference_frames(cpi);
+static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
+  VP9_COMMON *const cm = &cpi->common;
 
-    if (xd->update_mb_segmentation_data) {
-      signed char Data;
+  // Mode
+  vp9_write_literal(w, MIN(cm->txfm_mode, ALLOW_32X32), 2);
+  if (cm->txfm_mode >= ALLOW_32X32)
+    vp9_write_bit(w, cm->txfm_mode == TX_MODE_SELECT);
 
-      vp9_write_bit(&header_bc, (xd->mb_segment_abs_delta) ? 1 : 0);
+  // Probabilities
+  if (cm->txfm_mode == TX_MODE_SELECT) {
+    int i, j;
+    unsigned int ct_8x8p[TX_SIZE_MAX_SB - 3][2];
+    unsigned int ct_16x16p[TX_SIZE_MAX_SB - 2][2];
+    unsigned int ct_32x32p[TX_SIZE_MAX_SB - 1][2];
 
-      // For each segments id...
-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {
-        // For each segmentation codable feature...
-        for (j = 0; j < SEG_LVL_MAX; j++) {
-          Data = vp9_get_segdata(xd, i, j);
 
-          // If the feature is enabled...
-          if (vp9_segfeature_active(xd, i, j)) {
-            vp9_write_bit(&header_bc, 1);
-
-            // Is the segment data signed..
-            if (vp9_is_segfeature_signed(j)) {
-              // Encode the relevant feature data
-              if (Data < 0) {
-                Data = - Data;
-                vp9_encode_unsigned_max(&header_bc, Data,
-                                        vp9_seg_feature_data_max(j));
-                vp9_write_bit(&header_bc, 1);
-              } else {
-                vp9_encode_unsigned_max(&header_bc, Data,
-                                        vp9_seg_feature_data_max(j));
-                vp9_write_bit(&header_bc, 0);
-              }
-            }
-            // Unsigned data element so no sign bit needed
-            else
-              vp9_encode_unsigned_max(&header_bc, Data,
-                                      vp9_seg_feature_data_max(j));
-          } else
-            vp9_write_bit(&header_bc, 0);
-        }
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i],
+                                     ct_8x8p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) {
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_8x8p[i][j],
+                                  VP9_MODE_UPDATE_PROB, ct_8x8p[j]);
       }
     }
-  }
-
-  // Encode the common prediction model status flag probability updates for
-  // the reference frame
-  update_refpred_stats(cpi);
-  if (pc->frame_type != KEY_FRAME) {
-    for (i = 0; i < PREDICTION_PROBS; i++) {
-      if (cpi->ref_pred_probs_update[i]) {
-        vp9_write_bit(&header_bc, 1);
-        vp9_write_literal(&header_bc, pc->ref_pred_probs[i], 8);
-      } else {
-        vp9_write_bit(&header_bc, 0);
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i],
+                                       ct_16x16p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) {
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_16x16p[i][j],
+                                  VP9_MODE_UPDATE_PROB, ct_16x16p[j]);
       }
     }
+    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+      tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i],
+                                       ct_32x32p);
+      for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) {
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_32x32p[i][j],
+                                  VP9_MODE_UPDATE_PROB, ct_32x32p[j]);
+      }
+    }
+#ifdef MODE_STATS
+    if (!cpi->dummy_packing)
+      update_tx_count_stats(cm);
+#endif
   }
+}
 
-  pc->sb64_coded = get_binary_prob(cpi->sb64_count[0], cpi->sb64_count[1]);
-  vp9_write_literal(&header_bc, pc->sb64_coded, 8);
-  pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
-  vp9_write_literal(&header_bc, pc->sb32_coded, 8);
+static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
+                                     struct vp9_write_bit_buffer *wb) {
+  vp9_wb_write_bit(wb, type == SWITCHABLE);
+  if (type != SWITCHABLE)
+    vp9_wb_write_literal(wb, type, 2);
+}
 
-  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
-  if (cpi->mb.e_mbd.lossless) {
-    pc->txfm_mode = ONLY_4X4;
-  } else {
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
-                                cpi->txfm_count_16x16p[TX_4X4] +
-                                cpi->txfm_count_8x8p[TX_4X4],
-                                cpi->txfm_count_32x32p[TX_4X4] +
-                                cpi->txfm_count_32x32p[TX_8X8] +
-                                cpi->txfm_count_32x32p[TX_16X16] +
-                                cpi->txfm_count_32x32p[TX_32X32] +
-                                cpi->txfm_count_16x16p[TX_4X4] +
-                                cpi->txfm_count_16x16p[TX_8X8] +
-                                cpi->txfm_count_16x16p[TX_16X16] +
-                                cpi->txfm_count_8x8p[TX_4X4] +
-                                cpi->txfm_count_8x8p[TX_8X8]);
-      pc->prob_tx[1] = get_prob(cpi->txfm_count_32x32p[TX_8X8] +
-                                cpi->txfm_count_16x16p[TX_8X8],
-                                cpi->txfm_count_32x32p[TX_8X8] +
-                                cpi->txfm_count_32x32p[TX_16X16] +
-                                cpi->txfm_count_32x32p[TX_32X32] +
-                                cpi->txfm_count_16x16p[TX_8X8] +
-                                cpi->txfm_count_16x16p[TX_16X16]);
-      pc->prob_tx[2] = get_prob(cpi->txfm_count_32x32p[TX_16X16],
-                                cpi->txfm_count_32x32p[TX_16X16] +
-                                cpi->txfm_count_32x32p[TX_32X32]);
-    } else {
-      pc->prob_tx[0] = 128;
-      pc->prob_tx[1] = 128;
-      pc->prob_tx[2] = 128;
+static void fix_mcomp_filter_type(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  if (cm->mcomp_filter_type == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[VP9_SWITCHABLE_FILTERS];
+    int i, j, c = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+      count[i] = 0;
+      for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
+        count[i] += cm->fc.switchable_interp_count[j][i];
+      c += (count[i] > 0);
     }
-    vp9_write_literal(&header_bc, pc->txfm_mode <= 3 ? pc->txfm_mode : 3, 2);
-    if (pc->txfm_mode > ALLOW_16X16) {
-      vp9_write_bit(&header_bc, pc->txfm_mode == TX_MODE_SELECT);
+    if (c == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          cm->mcomp_filter_type = vp9_switchable_interp[i];
+          break;
+        }
+      }
     }
-    if (pc->txfm_mode == TX_MODE_SELECT) {
-      vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
-      vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
-      vp9_write_literal(&header_bc, pc->prob_tx[2], 8);
-    }
   }
+}
 
-  // Encode the loop filter level and type
-  vp9_write_bit(&header_bc, pc->filter_type);
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-#if CONFIG_LOOP_DERING
-  if (pc->dering_enabled) {
-    vp9_write_bit(&header_bc, 1);
-    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
-  } else {
-    vp9_write_bit(&header_bc, 0);
+static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) {
+  int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
+  vp9_get_tile_n_bits(cm, &min_log2_tiles, &delta_log2_tiles);
+  n_tile_bits = cm->log2_tile_columns - min_log2_tiles;
+  for (n = 0; n < delta_log2_tiles; n++) {
+    if (n_tile_bits--) {
+      vp9_wb_write_bit(wb, 1);
+    } else {
+      vp9_wb_write_bit(wb, 0);
+      break;
+    }
   }
-#endif
 
-  // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
-  vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+  vp9_wb_write_bit(wb, cm->log2_tile_rows != 0);
+  if (cm->log2_tile_rows != 0)
+    vp9_wb_write_bit(wb, cm->log2_tile_rows != 1);
+}
 
-  if (xd->mode_ref_lf_delta_enabled) {
-    // Do the deltas need to be updated
-    int send_update = xd->mode_ref_lf_delta_update;
+static int get_refresh_mask(VP9_COMP *cpi) {
+    // Should the GF or ARF be updated using the transmitted frame or buffer
+#if CONFIG_MULTIPLE_ARF
+    if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
+        !cpi->refresh_alt_ref_frame) {
+#else
+    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+#endif
+      // Preserve the previously existing golden frame and update the frame in
+      // the alt ref slot instead. This is highly specific to the use of
+      // alt-ref as a forward reference, and this needs to be generalized as
+      // other uses are implemented (like RTC/temporal scaling)
+      //
+      // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
+      // that happens in vp9_onyx_if.c:update_reference_frames() so that it can
+      // be done outside of the recode loop.
+      return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+             (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    } else {
+      int arf_idx = cpi->alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+      // Determine which ARF buffer to use to encode this ARF frame.
+      if (cpi->multi_arf_enabled) {
+        int sn = cpi->sequence_number;
+        arf_idx = (cpi->frame_coding_order[sn] < 0) ?
+            cpi->arf_buffer_idx[sn + 1] :
+            cpi->arf_buffer_idx[sn];
+      }
+#endif
+      return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+             (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+             (cpi->refresh_alt_ref_frame << arf_idx);
+    }
+}
 
-    vp9_write_bit(&header_bc, send_update);
-    if (send_update) {
-      int Data;
+static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
 
-      // Send update
-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
-        Data = xd->ref_lf_deltas[i];
+  const int scaling_active = cm->width != cm->display_width ||
+                             cm->height != cm->display_height;
+  vp9_wb_write_bit(wb, scaling_active);
+  if (scaling_active) {
+    vp9_wb_write_literal(wb, cm->display_width - 1, 16);
+    vp9_wb_write_literal(wb, cm->display_height - 1, 16);
+  }
+}
 
-        // Frame level data
-        if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]) {
-          xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
+static void write_frame_size(VP9_COMP *cpi,
+                             struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  vp9_wb_write_literal(wb, cm->width - 1, 16);
+  vp9_wb_write_literal(wb, cm->height - 1, 16);
 
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
+  write_display_size(cpi, wb);
+}
 
-      // Send update
-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
-        Data = xd->mode_lf_deltas[i];
+static void write_frame_size_with_refs(VP9_COMP *cpi,
+                                       struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+                                      cpi->alt_fb_idx};
+  int i, found = 0;
 
-        if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]) {
-          xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
-          vp9_write_bit(&header_bc, 1);
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
+    found = cm->width == cfg->y_crop_width &&
+            cm->height == cfg->y_crop_height;
+    vp9_wb_write_bit(wb, found);
+    if (found)
+      break;
+  }
 
-          if (Data > 0) {
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 0);    // sign
-          } else {
-            Data = -Data;
-            vp9_write_literal(&header_bc, (Data & 0x3F), 6);
-            vp9_write_bit(&header_bc, 1);    // sign
-          }
-        } else {
-          vp9_write_bit(&header_bc, 0);
-        }
-      }
-    }
+  if (!found) {
+    vp9_wb_write_literal(wb, cm->width - 1, 16);
+    vp9_wb_write_literal(wb, cm->height - 1, 16);
   }
 
-  // signal here is multi token partition is enabled
-  // vp9_write_literal(&header_bc, pc->multi_token_partition, 2);
-  vp9_write_literal(&header_bc, 0, 2);
+  write_display_size(cpi, wb);
+}
 
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
+static void write_sync_code(struct vp9_write_bit_buffer *wb) {
+  vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
+  vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
+  vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+}
 
-  // Transmit Dc, Second order and Uv quantizer delta information
-  put_delta_q(&header_bc, pc->y1dc_delta_q);
-  put_delta_q(&header_bc, pc->uvdc_delta_q);
-  put_delta_q(&header_bc, pc->uvac_delta_q);
+static void write_uncompressed_header(VP9_COMP *cpi,
+                                      struct vp9_write_bit_buffer *wb) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  // When there is a key frame all reference buffers are updated using the new key frame
-  if (pc->frame_type != KEY_FRAME) {
-    int refresh_mask;
+  // frame marker bits
+  vp9_wb_write_literal(wb, 0x2, 2);
 
-    // Should the GF or ARF be updated using the transmitted frame or buffer
-    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
-      /* Preserve the previously existing golden frame and update the frame in
-       * the alt ref slot instead. This is highly specific to the use of
-       * alt-ref as a forward reference, and this needs to be generalized as
-       * other uses are implemented (like RTC/temporal scaling)
-       *
-       * gld_fb_idx and alt_fb_idx need to be swapped for future frames, but
-       * that happens in vp9_onyx_if.c:update_reference_frames() so that it can
-       * be done outside of the recode loop.
-       */
-      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
-                     (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+  // bitstream version.
+  // 00 - profile 0. 4:2:0 only
+  // 10 - profile 1. adds 4:4:4, 4:2:2, alpha
+  vp9_wb_write_bit(wb, cm->version);
+  vp9_wb_write_bit(wb, 0);
+
+  vp9_wb_write_bit(wb, 0);
+  vp9_wb_write_bit(wb, cm->frame_type);
+  vp9_wb_write_bit(wb, cm->show_frame);
+  vp9_wb_write_bit(wb, cm->error_resilient_mode);
+
+  if (cm->frame_type == KEY_FRAME) {
+    write_sync_code(wb);
+    // colorspaces
+    // 000 - Unknown
+    // 001 - BT.601
+    // 010 - BT.709
+    // 011 - SMPTE-170
+    // 100 - SMPTE-240
+    // 101 - Reserved
+    // 110 - Reserved
+    // 111 - sRGB (RGB)
+    vp9_wb_write_literal(wb, 0, 3);
+    if (1 /* colorspace != sRGB */) {
+      vp9_wb_write_bit(wb, 0);  // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+      if (cm->version == 1) {
+        vp9_wb_write_bit(wb, cm->subsampling_x);
+        vp9_wb_write_bit(wb, cm->subsampling_y);
+        vp9_wb_write_bit(wb, 0);  // has extra plane
+      }
     } else {
-      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |
-                     (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
-                     (cpi->refresh_alt_ref_frame << cpi->alt_fb_idx);
+      assert(cm->version == 1);
+      vp9_wb_write_bit(wb, 0);  // has extra plane
     }
-    vp9_write_literal(&header_bc, refresh_mask, NUM_REF_FRAMES);
-    vp9_write_literal(&header_bc, cpi->lst_fb_idx, NUM_REF_FRAMES_LG2);
-    vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
-    vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    write_frame_size(cpi, wb);
+  } else {
+    const int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+                                              cpi->alt_fb_idx};
+    if (!cm->show_frame)
+      vp9_wb_write_bit(wb, cm->intra_only);
 
-    // Signal whether to allow high MV precision
-    vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
-    if (pc->mcomp_filter_type == SWITCHABLE) {
-      /* Check to see if only one of the filters is actually used */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int i, j, c = 0;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        c += (count[i] > 0);
+    if (!cm->error_resilient_mode)
+      vp9_wb_write_literal(wb, cm->reset_frame_context, 2);
+
+    if (cm->intra_only) {
+      write_sync_code(wb);
+
+      vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
+      write_frame_size(cpi, wb);
+    } else {
+      int i;
+      vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
+      for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+        vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LG2);
+        vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]);
       }
-      if (c == 1) {
-        /* Only one filter is used. So set the filter at frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            pc->mcomp_filter_type = vp9_switchable_interp[i];
-            break;
-          }
-        }
-      }
+
+      write_frame_size_with_refs(cpi, wb);
+
+      vp9_wb_write_bit(wb, xd->allow_high_precision_mv);
+
+      fix_mcomp_filter_type(cpi);
+      write_interp_filter_type(cm->mcomp_filter_type, wb);
     }
-    // Signal the type of subpel filter to use
-    vp9_write_bit(&header_bc, (pc->mcomp_filter_type == SWITCHABLE));
-    if (pc->mcomp_filter_type != SWITCHABLE)
-      vp9_write_literal(&header_bc, (pc->mcomp_filter_type), 2);
-#if CONFIG_COMP_INTERINTRA_PRED
-    //  printf("Counts: %d %d\n", cpi->interintra_count[0],
-    //         cpi->interintra_count[1]);
-    if (!cpi->dummy_packing && pc->use_interintra)
-      pc->use_interintra = (cpi->interintra_count[1] > 0);
-    vp9_write_bit(&header_bc, pc->use_interintra);
-    if (!pc->use_interintra)
-      vp9_zero(cpi->interintra_count);
-#endif
   }
 
-  if (!pc->error_resilient_mode) {
-    vp9_write_bit(&header_bc, pc->refresh_entropy_probs);
-    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);
+  if (!cm->error_resilient_mode) {
+    vp9_wb_write_bit(wb, cm->refresh_frame_context);
+    vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
   }
 
-  vp9_write_literal(&header_bc, pc->frame_context_idx,
-                    NUM_FRAME_CONTEXTS_LG2);
+  vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2);
 
-#ifdef ENTROPY_STATS
-  if (pc->frame_type == INTER_FRAME)
-    active_section = 0;
-  else
-    active_section = 7;
-#endif
+  encode_loopfilter(cm, xd, wb);
+  encode_quantization(cm, wb);
+  encode_segmentation(cpi, wb);
 
-  // If appropriate update the inter mode probability context and code the
-  // changes in the bitstream.
-  if (pc->frame_type != KEY_FRAME) {
-    int i, j;
-    int new_context[INTER_MODE_CONTEXTS][4];
-    if (!cpi->dummy_packing) {
-      update_inter_mode_probs(pc, new_context);
-    } else {
-      // In dummy pack assume context unchanged.
-      vpx_memcpy(new_context, pc->fc.vp9_mode_contexts,
-                 sizeof(pc->fc.vp9_mode_contexts));
-    }
+  write_tile_info(cm, wb);
+}
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      for (j = 0; j < 4; j++) {
-        if (new_context[i][j] != pc->fc.vp9_mode_contexts[i][j]) {
-          vp9_write(&header_bc, 1, 252);
-          vp9_write_literal(&header_bc, new_context[i][j], 8);
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+  int i, bytes_packed;
+  VP9_COMMON *const pc = &cpi->common;
+  vp9_writer header_bc, residual_bc;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-          // Only update the persistent copy if this is the "real pack"
-          if (!cpi->dummy_packing) {
-            pc->fc.vp9_mode_contexts[i][j] = new_context[i][j];
-          }
-        } else {
-          vp9_write(&header_bc, 0, 252);
-        }
-      }
-    }
-  }
+  uint8_t *cx_data = dest;
+  struct vp9_write_bit_buffer wb = {dest, 0};
+  struct vp9_write_bit_buffer first_partition_size_wb;
 
-#if CONFIG_NEW_MVREF
-  if ((pc->frame_type != KEY_FRAME)) {
-    int new_mvref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
-    int i, j;
+  write_uncompressed_header(cpi, &wb);
+  first_partition_size_wb = wb;
+  vp9_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
 
-    update_mv_ref_probs(cpi, new_mvref_probs);
+  bytes_packed = vp9_rb_bytes_written(&wb);
+  cx_data += bytes_packed;
 
-    for (i = 0; i < MAX_REF_FRAMES; ++i) {
-      // Skip the dummy entry for intra ref frame.
-      if (i == INTRA_FRAME) {
-        continue;
-      }
+  compute_update_table();
 
-      // Encode any mandated updates to probabilities
-      for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
-        if (new_mvref_probs[i][j] != xd->mb_mv_ref_probs[i][j]) {
-          vp9_write(&header_bc, 1, VP9_MVREF_UPDATE_PROB);
-          vp9_write_literal(&header_bc, new_mvref_probs[i][j], 8);
+  vp9_start_encode(&header_bc, cx_data);
 
-          // Only update the persistent copy if this is the "real pack"
-          if (!cpi->dummy_packing) {
-            xd->mb_mv_ref_probs[i][j] = new_mvref_probs[i][j];
-          }
-        } else {
-          vp9_write(&header_bc, 0, VP9_MVREF_UPDATE_PROB);
-        }
-      }
-    }
-  }
+#ifdef ENTROPY_STATS
+  if (pc->frame_type == INTER_FRAME)
+    active_section = 0;
+  else
+    active_section = 7;
 #endif
 
   vp9_clear_system_state();  // __asm emms;
 
-  vp9_copy(cpi->common.fc.pre_coef_probs_4x4,
-           cpi->common.fc.coef_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_coef_probs_8x8,
-           cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_coef_probs_16x16,
-           cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
-           cpi->common.fc.coef_probs_32x32);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cpi->common.fc.pre_nzc_probs_4x4,
-           cpi->common.fc.nzc_probs_4x4);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_8x8,
-           cpi->common.fc.nzc_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_16x16,
-           cpi->common.fc.nzc_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_nzc_probs_32x32,
-           cpi->common.fc.nzc_probs_32x32);
-  vp9_copy(cpi->common.fc.pre_nzc_pcat_probs,
-           cpi->common.fc.nzc_pcat_probs);
-  // NOTE that if the counts are reset, we also need to uncomment
-  // the count updates in the write_nzc function
-  /*
-  vp9_zero(cpi->common.fc.nzc_counts_4x4);
-  vp9_zero(cpi->common.fc.nzc_counts_8x8);
-  vp9_zero(cpi->common.fc.nzc_counts_16x16);
-  vp9_zero(cpi->common.fc.nzc_counts_32x32);
-  vp9_zero(cpi->common.fc.nzc_pcat_counts);
-  */
-#endif
-  vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
-  vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
-  vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
-  vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
-  vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
-  cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
-#if CONFIG_COMP_INTERINTRA_PRED
-  cpi->common.fc.pre_interintra_prob = cpi->common.fc.interintra_prob;
-#endif
-  vp9_zero(cpi->sub_mv_ref_count);
-  vp9_zero(cpi->mbsplit_count);
-  vp9_zero(cpi->common.fc.mv_ref_ct)
+  vp9_copy(pc->fc.pre_coef_probs, pc->fc.coef_probs);
+  vp9_copy(pc->fc.pre_y_mode_prob, pc->fc.y_mode_prob);
+  vp9_copy(pc->fc.pre_uv_mode_prob, pc->fc.uv_mode_prob);
+  vp9_copy(pc->fc.pre_partition_prob, pc->fc.partition_prob[INTER_FRAME]);
+  pc->fc.pre_nmvc = pc->fc.nmvc;
+  vp9_copy(pc->fc.pre_switchable_interp_prob, pc->fc.switchable_interp_prob);
+  vp9_copy(pc->fc.pre_inter_mode_probs, pc->fc.inter_mode_probs);
+  vp9_copy(pc->fc.pre_intra_inter_prob, pc->fc.intra_inter_prob);
+  vp9_copy(pc->fc.pre_comp_inter_prob, pc->fc.comp_inter_prob);
+  vp9_copy(pc->fc.pre_comp_ref_prob, pc->fc.comp_ref_prob);
+  vp9_copy(pc->fc.pre_single_ref_prob, pc->fc.single_ref_prob);
+  vp9_copy(pc->fc.pre_tx_probs_8x8p, pc->fc.tx_probs_8x8p);
+  vp9_copy(pc->fc.pre_tx_probs_16x16p, pc->fc.tx_probs_16x16p);
+  vp9_copy(pc->fc.pre_tx_probs_32x32p, pc->fc.tx_probs_32x32p);
+  vp9_copy(pc->fc.pre_mbskip_probs, pc->fc.mbskip_probs);
 
+  if (xd->lossless) {
+    pc->txfm_mode = ONLY_4X4;
+  } else {
+    encode_txfm_probs(cpi, &header_bc);
+  }
+
   update_coef_probs(cpi, &header_bc);
-#if CONFIG_CODE_NONZEROCOUNT
-  update_nzc_probs(cpi, &header_bc);
-#endif
 
 #ifdef ENTROPY_STATS
   active_section = 2;
 #endif
 
-  // Write out the mb_no_coeff_skip flag
-  vp9_write_bit(&header_bc, pc->mb_no_coeff_skip);
-  if (pc->mb_no_coeff_skip) {
-    int k;
+  vp9_update_skip_probs(cpi, &header_bc);
 
-    vp9_update_skip_probs(cpi);
-    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-      vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);
-    }
-  }
-
-  if (pc->frame_type == KEY_FRAME) {
-    if (!pc->kf_ymode_probs_update) {
-      vp9_write_literal(&header_bc, pc->kf_ymode_probs_index, 3);
-    }
-  } else {
-    // Update the probabilities used to encode reference frame data
-    update_ref_probs(cpi);
-
+  if (pc->frame_type != KEY_FRAME) {
 #ifdef ENTROPY_STATS
     active_section = 1;
 #endif
 
+    update_inter_mode_probs(pc, &header_bc);
+    vp9_zero(cpi->common.fc.inter_mode_counts);
+
     if (pc->mcomp_filter_type == SWITCHABLE)
       update_switchable_interp_probs(cpi, &header_bc);
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (pc->use_interintra) {
-      vp9_cond_prob_update(&header_bc,
-                           &pc->fc.interintra_prob,
-                           VP9_UPD_INTERINTRA_PROB,
-                           cpi->interintra_count);
-    }
-#endif
+    for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+      vp9_cond_prob_diff_update(&header_bc, &pc->fc.intra_inter_prob[i],
+                                VP9_MODE_UPDATE_PROB,
+                                cpi->intra_inter_count[i]);
 
-    vp9_write_literal(&header_bc, pc->prob_intra_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_last_coded, 8);
-    vp9_write_literal(&header_bc, pc->prob_gf_coded, 8);
-
-    {
+    if (pc->allow_comp_inter_inter) {
       const int comp_pred_mode = cpi->common.comp_pred_mode;
       const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY);
       const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION);
 
-      vp9_write(&header_bc, use_compound_pred, 128);
+      vp9_write_bit(&header_bc, use_compound_pred);
       if (use_compound_pred) {
-        vp9_write(&header_bc, use_hybrid_pred, 128);
+        vp9_write_bit(&header_bc, use_hybrid_pred);
         if (use_hybrid_pred) {
-          for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-            pc->prob_comppred[i] = get_binary_prob(cpi->single_pred_count[i],
-                                                   cpi->comp_pred_count[i]);
-            vp9_write_literal(&header_bc, pc->prob_comppred[i], 8);
-          }
+          for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+            vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_inter_prob[i],
+                                      VP9_MODE_UPDATE_PROB,
+                                      cpi->comp_inter_count[i]);
         }
       }
     }
-    update_mbintra_mode_probs(cpi, &header_bc);
 
-    vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
-  }
-
-  /* tiling */
-  {
-    int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;
-
-    vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles);
-    n_tile_bits = pc->log2_tile_columns - min_log2_tiles;
-    for (n = 0; n < delta_log2_tiles; n++) {
-      if (n_tile_bits--) {
-        vp9_write_bit(&header_bc, 1);
-      } else {
-        vp9_write_bit(&header_bc, 0);
-        break;
+    if (pc->comp_pred_mode != COMP_PREDICTION_ONLY) {
+      for (i = 0; i < REF_CONTEXTS; i++) {
+        vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][0],
+                                  VP9_MODE_UPDATE_PROB,
+                                  cpi->single_ref_count[i][0]);
+        vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][1],
+                                  VP9_MODE_UPDATE_PROB,
+                                  cpi->single_ref_count[i][1]);
       }
     }
-    vp9_write_bit(&header_bc, pc->log2_tile_rows != 0);
-    if (pc->log2_tile_rows != 0)
-      vp9_write_bit(&header_bc, pc->log2_tile_rows != 1);
-  }
 
-  vp9_stop_encode(&header_bc);
+    if (pc->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+      for (i = 0; i < REF_CONTEXTS; i++)
+        vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_ref_prob[i],
+                                  VP9_MODE_UPDATE_PROB,
+                                  cpi->comp_ref_count[i]);
+    }
 
-  oh.first_partition_length_in_bytes = header_bc.pos;
+    update_mbintra_mode_probs(cpi, &header_bc);
 
-  /* update frame tag */
-  {
-    int scaling = (pc->width != pc->display_width ||
-                   pc->height != pc->display_height);
-    int v = (oh.first_partition_length_in_bytes << 8) |
-            (scaling << 5) |
-            (oh.show_frame << 4) |
-            (oh.version << 1) |
-            oh.type;
+    for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) {
+      vp9_prob Pnew[PARTITION_TYPES - 1];
+      unsigned int bct[PARTITION_TYPES - 1][2];
+      update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings,
+                  vp9_partition_tree, Pnew,
+                  pc->fc.partition_prob[pc->frame_type][i], bct,
+                  (unsigned int *)cpi->partition_count[i]);
+    }
 
-    assert(oh.first_partition_length_in_bytes <= 0xffff);
-    dest[0] = v;
-    dest[1] = v >> 8;
-    dest[2] = v >> 16;
+    vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);
   }
 
-  *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;
 
-  if (pc->frame_type == KEY_FRAME) {
-    decide_kf_ymode_entropy(cpi);
-  } else {
-    /* This is not required if the counts in cpi are consistent with the
-     * final packing pass */
-    // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);
-  }
+  vp9_stop_encode(&header_bc);
 
+
+  // first partition size
+  assert(header_bc.pos <= 0xffff);
+  vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16);
+  *size = bytes_packed + header_bc.pos;
+
   {
     int tile_row, tile_col, total_size = 0;
     unsigned char *data_ptr = cx_data + header_bc.pos;
@@ -2943,11 +1748,8 @@
         write_modes(cpi, &residual_bc, &tok[tile_col], tok_end);
         vp9_stop_encode(&residual_bc);
         if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {
-          /* size of this tile */
-          data_ptr[total_size + 0] = residual_bc.pos;
-          data_ptr[total_size + 1] = residual_bc.pos >> 8;
-          data_ptr[total_size + 2] = residual_bc.pos >> 16;
-          data_ptr[total_size + 3] = residual_bc.pos >> 24;
+          // size of this tile
+          write_be32(data_ptr + total_size, residual_bc.pos);
           total_size += 4;
         }
 
@@ -2999,21 +1801,18 @@
   FILE *f = fopen("coefupdprob.h", "w");
   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
-  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_4X4],   BLOCK_TYPES,
                              "vp9_coef_update_probs_4x4[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_8X8],   BLOCK_TYPES,
                              "vp9_coef_update_probs_8x8[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_16X16], BLOCK_TYPES,
                              "vp9_coef_update_probs_16x16[BLOCK_TYPES]");
-  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES,
+  print_tree_update_for_type(f, tree_update_hist[TX_32X32], BLOCK_TYPES,
                              "vp9_coef_update_probs_32x32[BLOCK_TYPES]");
 
   fclose(f);
   f = fopen("treeupdate.bin", "wb");
-  fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
-  fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-  fwrite(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
+  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
   fclose(f);
 }
 #endif
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -12,6 +12,6 @@
 #ifndef VP9_ENCODER_VP9_BITSTREAM_H_
 #define VP9_ENCODER_VP9_BITSTREAM_H_
 
-void vp9_update_skip_probs(VP9_COMP *cpi);
+void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc);
 
 #endif  // VP9_ENCODER_VP9_BITSTREAM_H_
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -23,43 +23,13 @@
   int offset;
 } search_site;
 
-typedef struct block {
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  int16_t *src_diff;
-  int16_t *coeff;
-
-  // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  int16_t *quant;
-  int16_t *quant_fast;      // fast quant deprecated for now
-  uint8_t *quant_shift;
-  int16_t *zbin;
-  int16_t *zbin_8x8;
-  int16_t *zbin_16x16;
-  int16_t *zbin_32x32;
-  int16_t *zrun_zbin_boost;
-  int16_t *zrun_zbin_boost_8x8;
-  int16_t *zrun_zbin_boost_16x16;
-  int16_t *zrun_zbin_boost_32x32;
-  int16_t *round;
-
-  // Zbin Over Quant value
-  short zbin_extra;
-
-  uint8_t **base_src;
-  uint8_t **base_second_src;
-  int src;
-  int src_stride;
-
-  int skip_block;
-} BLOCK;
-
 typedef struct {
   int count;
   struct {
-    B_PREDICTION_MODE mode;
+    MB_PREDICTION_MODE mode;
     int_mv mv;
     int_mv second_mv;
-  } bmi[16];
+  } bmi[4];
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process
@@ -81,18 +51,36 @@
   int comp_pred_diff;
   int single_pred_diff;
   int64_t txfm_rd_diff[NB_TXFM_MODES];
+
+  // Bit flag for each mode whether it has high error in comparison to others.
+  unsigned int modes_with_high_error;
+
+  // Bit flag for each ref frame whether it has high error compared to others.
+  unsigned int frames_with_high_error;
 } PICK_MODE_CONTEXT;
 
+struct macroblock_plane {
+  DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);
+  DECLARE_ALIGNED(16, int16_t, coeff[64*64]);
+  struct buf_2d src;
+
+  // Quantizer setings
+  int16_t *quant;
+  uint8_t *quant_shift;
+  int16_t *zbin;
+  int16_t *zrun_zbin_boost;
+  int16_t *round;
+
+  // Zbin Over Quant value
+  int16_t zbin_extra;
+};
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
-  DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]);
-  DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]);
-  // 16 Y blocks, 4 U blocks, 4 V blocks,
-  BLOCK block[24];
+  struct macroblock_plane plane[MAX_MB_PLANE];
 
-  YV12_BUFFER_CONFIG src;
-
   MACROBLOCKD e_mbd;
+  int skip_block;
   PARTITION_INFO *partition_info; /* work pointer */
   PARTITION_INFO *pi;   /* Corresponds to upper left visible macroblock */
   PARTITION_INFO *pip;  /* Base of allocated array */
@@ -126,11 +114,9 @@
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
-  int mbmode_cost[2][MB_MODE_COUNT];
+  int mbmode_cost[MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
-  int bmode_costs[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES];
-  int i8x8_mode_costs[MB_MODE_COUNT];
-  int inter_bmode_costs[B_MODE_COUNT];
+  int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
 
@@ -145,36 +131,43 @@
 
   int encode_breakout;
 
-  // char * gf_active_ptr;
-  signed char *gf_active_ptr;
-
   unsigned char *active_ptr;
 
+  // note that token_costs is the cost when eob node is skipped
   vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
-#if CONFIG_CODE_NONZEROCOUNT
-  unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17];
-  unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65];
-  unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257];
-  unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025];
-#endif
+  vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
   int optimize;
 
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
+  // TODO(jingning): Need to refactor the structure arrays that buffers the
+  // coding mode decisions of each partition type.
+  PICK_MODE_CONTEXT ab4x4_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x4_context[4][4][4];
+  PICK_MODE_CONTEXT sb4x8_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x8_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x16_context[4][4][2];
+  PICK_MODE_CONTEXT sb16x8_context[4][4][2];
   PICK_MODE_CONTEXT mb_context[4][4];
+  PICK_MODE_CONTEXT sb32x16_context[4][2];
+  PICK_MODE_CONTEXT sb16x32_context[4][2];
   // when 4 MBs share coding parameters:
   PICK_MODE_CONTEXT sb32_context[4];
+  PICK_MODE_CONTEXT sb32x64_context[2];
+  PICK_MODE_CONTEXT sb64x32_context[2];
   PICK_MODE_CONTEXT sb64_context;
+  int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
+  BLOCK_SIZE_TYPE b_partitioning[4][4][4];
+  BLOCK_SIZE_TYPE mb_partitioning[4][4];
+  BLOCK_SIZE_TYPE sb_partitioning[4];
+  BLOCK_SIZE_TYPE sb64_partitioning;
+
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
-  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx);
-  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2);
-  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type);
-  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type);
+  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
+                         int y_blocks);
 };
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 #include "vp9/encoder/vp9_boolhuff.h"
+#include "vp9/common/vp9_entropy.h"
 
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
@@ -39,7 +40,7 @@
   22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
 };
 
-void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {
+void vp9_start_encode(vp9_writer *br, uint8_t *source) {
   br->lowvalue = 0;
   br->range    = 255;
   br->value    = 0;
@@ -46,13 +47,14 @@
   br->count    = -24;
   br->buffer   = source;
   br->pos      = 0;
+  vp9_write_bit(br, 0);
 }
 
-void vp9_stop_encode(BOOL_CODER *br) {
+void vp9_stop_encode(vp9_writer *br) {
   int i;
 
   for (i = 0; i < 32; i++)
-    encode_bool(br, 0, 128);
+    vp9_write_bit(br, 0);
 
   // Ensure there's no ambigous collision with any index marker bytes
   if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)
@@ -59,107 +61,3 @@
     br->buffer[br->pos++] = 0;
 }
 
-
-void vp9_encode_value(BOOL_CODER *br, int data, int bits) {
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    encode_bool(br, (1 & (data >> bit)), 0x80);
-}
-
-void vp9_encode_unsigned_max(BOOL_CODER *br, int data, int max) {
-  assert(data <= max);
-  while (max) {
-    encode_bool(br, data & 1, 128);
-    data >>= 1;
-    max >>= 1;
-  }
-}
-
-int vp9_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if (v >= m) return ((v - m) << 1);
-  else return ((m - v) << 1) - 1;
-}
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-void vp9_encode_uniform(BOOL_CODER *br, int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return;
-  m = (1 << l) - n;
-  if (v < m)
-    vp9_encode_value(br, v, l - 1);
-  else {
-    vp9_encode_value(br, m + ((v - m) >> 1), l - 1);
-    vp9_encode_value(br, (v - m) & 1, 1);
-  }
-}
-
-int vp9_count_uniform(int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return 0;
-  m = (1 << l) - n;
-  if (v < m)
-    return l - 1;
-  else
-    return l;
-}
-
-void vp9_encode_term_subexp(BOOL_CODER *br, int word, int k, int num_syms) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      vp9_encode_uniform(br, word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      vp9_encode_value(br, t, 1);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        vp9_encode_value(br, word - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-int vp9_count_term_subexp(int word, int k, int num_syms) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      count += vp9_count_uniform(word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
-}
--- a/vp9/encoder/vp9_boolhuff.h
+++ b/vp9/encoder/vp9_boolhuff.h
@@ -27,30 +27,21 @@
   unsigned int value;
   int count;
   unsigned int pos;
-  unsigned char *buffer;
+  uint8_t *buffer;
 
   // Variables used to track bit costs without outputing to the bitstream
   unsigned int  measure_cost;
   unsigned long bit_counter;
-} BOOL_CODER;
+} vp9_writer;
 
-extern void vp9_start_encode(BOOL_CODER *bc, unsigned char *buffer);
-
-extern void vp9_encode_value(BOOL_CODER *br, int data, int bits);
-extern void vp9_encode_unsigned_max(BOOL_CODER *br, int data, int max);
-extern void vp9_stop_encode(BOOL_CODER *bc);
 extern const unsigned int vp9_prob_cost[256];
 
-extern void vp9_encode_uniform(BOOL_CODER *bc, int v, int n);
-extern void vp9_encode_term_subexp(BOOL_CODER *bc, int v, int k, int n);
-extern int vp9_count_uniform(int v, int n);
-extern int vp9_count_term_subexp(int v, int k, int n);
-extern int vp9_recenter_nonneg(int v, int m);
+void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
+void vp9_stop_encode(vp9_writer *bc);
 
 DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
 
-
-static void encode_bool(BOOL_CODER *br, int bit, int probability) {
+static void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
   unsigned int range = br->range;
@@ -89,7 +80,7 @@
       int x = br->pos - 1;
 
       while (x >= 0 && br->buffer[x] == 0xff) {
-        br->buffer[x] = (unsigned char)0;
+        br->buffer[x] = 0;
         x--;
       }
 
@@ -108,5 +99,17 @@
   br->lowvalue = lowvalue;
   br->range = range;
 }
+
+static void vp9_write_bit(vp9_writer *w, int bit) {
+  vp9_write(w, bit, 128);  // vp9_prob_half
+}
+
+static void vp9_write_literal(vp9_writer *w, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--)
+    vp9_write_bit(w, 1 & (data >> bit));
+}
+
 
 #endif  // VP9_ENCODER_VP9_BOOLHUFF_H_
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -591,23 +591,32 @@
   }
 }
 
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+   pixel. */
 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {
   int i;
-  int a1, b1, c1, d1;
+  int a1, b1, c1, d1, e1;
   short *ip = input;
   short *op = output;
   int pitch_short = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];
-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];
-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];
-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];
+    a1 = ip[0 * pitch_short];
+    b1 = ip[1 * pitch_short];
+    c1 = ip[2 * pitch_short];
+    d1 = ip[3 * pitch_short];
 
-    op[0] = (a1 + b1 + 1) >> 1;
-    op[4] = (c1 + d1) >> 1;
-    op[8] = (a1 - b1) >> 1;
-    op[12] = (d1 - c1) >> 1;
+    a1 += b1;
+    d1 = d1 - c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = a1;
+    op[4] = c1;
+    op[8] = d1;
+    op[12] = b1;
 
     ip++;
     op++;
@@ -616,15 +625,22 @@
   op = output;
 
   for (i = 0; i < 4; i++) {
-    a1 = ip[0] + ip[3];
-    b1 = ip[1] + ip[2];
-    c1 = ip[1] - ip[2];
-    d1 = ip[0] - ip[3];
+    a1 = ip[0];
+    b1 = ip[1];
+    c1 = ip[2];
+    d1 = ip[3];
 
-    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;
-    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;
+    a1 += b1;
+    d1 -= c1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= c1;
+    d1 += b1;
+    op[0] = a1 << WHT_UPSCALE_FACTOR;
+    op[1] = c1 << WHT_UPSCALE_FACTOR;
+    op[2] = d1 << WHT_UPSCALE_FACTOR;
+    op[3] = b1 << WHT_UPSCALE_FACTOR;
 
     ip += 4;
     op += 4;
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -10,6 +10,7 @@
 
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
@@ -20,7 +21,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -47,29 +47,12 @@
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mb_row, int mb_col);
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int output_enabled, int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize);
 
-static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col);
-
-static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col);
-
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
-#ifdef MODE_STATS
-unsigned int inter_y_modes[MB_MODE_COUNT];
-unsigned int inter_uv_modes[VP9_UV_MODES];
-unsigned int inter_b_modes[B_MODE_COUNT];
-unsigned int y_modes[VP9_YMODES];
-unsigned int i8x8_modes[VP9_I8X8_MODES];
-unsigned int uv_modes[VP9_UV_MODES];
-unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-unsigned int b_modes[B_MODE_COUNT];
-#endif
-
-
 /* activity_avg must be positive, or flat regions could get a zero weight
  *  (infinite lambda), which confounds analysis.
  * This also avoids the need for divide by zero checks in
@@ -98,8 +81,8 @@
    *  lambda using a non-linear combination (e.g., the smallest, or second
    *  smallest, etc.).
    */
-  act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,
-                          &sse);
+  act = vp9_variance16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                          VP9_VAR_OFFS, 0, &sse);
   act <<= 4;
 
   /* If the region is flat, lower the activity some more. */
@@ -115,7 +98,9 @@
   return vp9_encode_intra(cpi, x, use_dc_pred);
 }
 
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
 
+
 // Measure the activity of the current macroblock
 // What we measure here is TBD so abstracted to this function
 #define ALT_ACT_MEASURE 1
@@ -280,7 +265,7 @@
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
 #if ALT_ACT_MEASURE
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
       xd->left_available = (mb_col != 0);
       recon_yoffset += 16;
 #endif
@@ -298,19 +283,12 @@
       x->mb_activity_ptr++;
 
       // adjust to the next column of source macroblocks
-      x->src.y_buffer += 16;
+      x->plane[0].src.buf += 16;
     }
 
 
     // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-
-#if ALT_ACT_MEASURE
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-#endif
-
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
   }
 
   // Calculate an "average" MB activity
@@ -347,89 +325,9 @@
   adjust_act_zbin(cpi, x);
 }
 
-#if CONFIG_NEW_MVREF
-static int vp9_cost_mv_ref_id(vp9_prob * ref_id_probs, int mv_ref_id) {
-  int cost;
-
-  // Encode the index for the MV reference.
-  switch (mv_ref_id) {
-    case 0:
-      cost = vp9_cost_zero(ref_id_probs[0]);
-      break;
-    case 1:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_zero(ref_id_probs[1]);
-      break;
-    case 2:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_zero(ref_id_probs[2]);
-      break;
-    case 3:
-      cost = vp9_cost_one(ref_id_probs[0]);
-      cost += vp9_cost_one(ref_id_probs[1]);
-      cost += vp9_cost_one(ref_id_probs[2]);
-      break;
-
-      // TRAP.. This should not happen
-    default:
-      assert(0);
-      break;
-  }
-  return cost;
-}
-
-// Estimate the cost of each coding the vector using each reference candidate
-static unsigned int pick_best_mv_ref(MACROBLOCK *x,
-                                     MV_REFERENCE_FRAME ref_frame,
-                                     int_mv target_mv,
-                                     int_mv * mv_ref_list,
-                                     int_mv * best_ref) {
-  int i;
-  int best_index = 0;
-  int cost, cost2;
-  int zero_seen = (mv_ref_list[0].as_int) ? FALSE : TRUE;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int max_mv = MV_MAX;
-
-  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_probs[ref_frame], 0) +
-         vp9_mv_bit_cost(&target_mv, &mv_ref_list[0], x->nmvjointcost,
-                         x->mvcost, 96, xd->allow_high_precision_mv);
-
-  for (i = 1; i < MAX_MV_REF_CANDIDATES; ++i) {
-    // If we see a 0,0 reference vector for a second time we have reached
-    // the end of the list of valid candidate vectors.
-    if (!mv_ref_list[i].as_int) {
-      if (zero_seen)
-        break;
-      else
-        zero_seen = TRUE;
-    }
-
-    // Check for cases where the reference choice would give rise to an
-    // uncodable/out of range residual for row or col.
-    if ((abs(target_mv.as_mv.row - mv_ref_list[i].as_mv.row) > max_mv) ||
-        (abs(target_mv.as_mv.col - mv_ref_list[i].as_mv.col) > max_mv)) {
-      continue;
-    }
-
-    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_probs[ref_frame], i) +
-            vp9_mv_bit_cost(&target_mv, &mv_ref_list[i], x->nmvjointcost,
-                            x->mvcost, 96, xd->allow_high_precision_mv);
-
-    if (cost2 < cost) {
-      cost = cost2;
-      best_index = i;
-    }
-  }
-  best_ref->as_int = mv_ref_list[best_index].as_int;
-
-  return best_index;
-}
-#endif
-
 static void update_state(VP9_COMP *cpi,
-                         PICK_MODE_CONTEXT *ctx, int block_size,
+                         PICK_MODE_CONTEXT *ctx,
+                         BLOCK_SIZE_TYPE bsize,
                          int output_enabled) {
   int i, x_idx, y;
   MACROBLOCK *const x = &cpi->mb;
@@ -436,49 +334,42 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int mb_mode = mi->mbmi.mode;
+#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
+  MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
+#endif
   int mb_mode_index = ctx->best_mode_index;
   const int mis = cpi->common.mode_info_stride;
-  int mb_block_size = 1 << mi->mbmi.sb_type;
+  const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize);
 
 #if CONFIG_DEBUG
   assert(mb_mode < MB_MODE_COUNT);
   assert(mb_mode_index < MAX_MODES);
-  assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+  assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
+  assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
 #endif
-  assert(mi->mbmi.sb_type == (block_size >> 5));
 
+  assert(mi->mbmi.sb_type == bsize);
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  for (y = 0; y < mb_block_size; y++) {
-    for (x_idx = 0; x_idx < mb_block_size; x_idx++) {
-      if ((xd->mb_to_right_edge >> 7) + mb_block_size > x_idx &&
-          (xd->mb_to_bottom_edge >> 7) + mb_block_size > y) {
+  for (y = 0; y < bh; y++) {
+    for (x_idx = 0; x_idx < bw; x_idx++) {
+      if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx &&
+          (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
         MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
-
-        vpx_memcpy(mi_addr, mi, sizeof(MODE_INFO));
+        *mi_addr = *mi;
       }
     }
   }
-  if (block_size == 16) {
+  if (bsize < BLOCK_SIZE_SB32X32) {
+    if (bsize < BLOCK_SIZE_MB16X16)
+      ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8];
     ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
   }
 
-  if (mb_mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-      assert(xd->block[i].bmi.as_mode.first < B_MODE_COUNT);
-    }
-  } else if (mb_mode == I8X8_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  } else if (mb_mode == SPLITMV) {
-    vpx_memcpy(x->partition_info, &ctx->partition_info,
-               sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
+  if (mbmi->ref_frame[0] != INTRA_FRAME && mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+    *x->partition_info = ctx->partition_info;
+    mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
   }
 
   x->skip = ctx->skip;
@@ -485,18 +376,15 @@
   if (!output_enabled)
     return;
 
-  {
-    int segment_id = mbmi->segment_id;
-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
-      }
+  if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];
     }
   }
 
   if (cpi->common.frame_type == KEY_FRAME) {
     // Restore the coding modes to that held in the coding context
-    // if (mb_mode == B_PRED)
+    // if (mb_mode == I4X4_PRED)
     //    for (i = 0; i < 16; i++)
     //    {
     //        xd->block[i].bmi.as_mode =
@@ -515,8 +403,7 @@
       THR_D27_PRED /*D27_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
-      THR_I8X8_PRED /*I8X8_PRED*/,
-      THR_B_PRED /*B_PRED*/,
+      THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
 #endif
@@ -541,57 +428,34 @@
     */
     // Note how often each mode chosen as best
     cpi->mode_chosen_counts[mb_mode_index]++;
-    if (mbmi->mode == SPLITMV || mbmi->mode == NEWMV) {
+    if (mbmi->ref_frame[0] != INTRA_FRAME &&
+        (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
       int_mv best_mv, best_second_mv;
-      MV_REFERENCE_FRAME rf = mbmi->ref_frame;
-#if CONFIG_NEW_MVREF
-      unsigned int best_index;
-      MV_REFERENCE_FRAME sec_ref_frame = mbmi->second_ref_frame;
-#endif
+      const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
       best_mv.as_int = ctx->best_ref_mv.as_int;
       best_second_mv.as_int = ctx->second_best_ref_mv.as_int;
       if (mbmi->mode == NEWMV) {
-        best_mv.as_int = mbmi->ref_mvs[rf][0].as_int;
-        best_second_mv.as_int = mbmi->ref_mvs[mbmi->second_ref_frame][0].as_int;
-#if CONFIG_NEW_MVREF
-        best_index = pick_best_mv_ref(x, rf, mbmi->mv[0],
-                                      mbmi->ref_mvs[rf], &best_mv);
-        mbmi->best_index = best_index;
-        ++cpi->mb_mv_ref_count[rf][best_index];
-
-        if (mbmi->second_ref_frame > 0) {
-          unsigned int best_index;
-          best_index =
-              pick_best_mv_ref(x, sec_ref_frame, mbmi->mv[1],
-                               mbmi->ref_mvs[sec_ref_frame],
-                               &best_second_mv);
-          mbmi->best_second_index = best_index;
-          ++cpi->mb_mv_ref_count[sec_ref_frame][best_index];
-        }
-#endif
+        best_mv.as_int = mbmi->ref_mvs[rf1][0].as_int;
+        best_second_mv.as_int = mbmi->ref_mvs[rf2][0].as_int;
       }
       mbmi->best_mv.as_int = best_mv.as_int;
       mbmi->best_second_mv.as_int = best_second_mv.as_int;
       vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
     }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
-        mbmi->second_ref_frame <= INTRA_FRAME) {
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        ++cpi->interintra_count[1];
-        ++cpi->ymode_count[mbmi->interintra_mode];
-#if SEPARATE_INTERINTRA_UV
-        ++cpi->y_uv_mode_count[mbmi->interintra_mode][mbmi->interintra_uv_mode];
-#endif
-      } else {
-        ++cpi->interintra_count[0];
-      }
+
+    if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
+      int i, j;
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i &&
+              (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
+            xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
-#endif
+
     if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        mbmi->mode >= NEARESTMV &&
-        mbmi->mode <= SPLITMV) {
-      ++cpi->switchable_interp_count
+        is_inter_mode(mbmi->mode)) {
+      ++cpi->common.fc.switchable_interp_count
           [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
           [vp9_switchable_interp_map[mbmi->interp_filter]];
     }
@@ -602,14 +466,16 @@
   }
 }
 
-static unsigned find_seg_id(uint8_t *buf, int block_size,
+static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize,
                             int start_y, int height, int start_x, int width) {
-  const int end_x = MIN(start_x + block_size, width);
-  const int end_y = MIN(start_y + block_size, height);
+  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+  const int end_x = MIN(start_x + bw, width);
+  const int end_y = MIN(start_y + bh, height);
   int x, y;
   unsigned seg_id = -1;
 
   buf += width * start_y;
+  assert(start_y < cm->mi_rows && start_x < cm->cur_tile_mi_col_end);
   for (y = start_y; y < end_y; y++, buf += width) {
     for (x = start_x; x < end_x; x++) {
       seg_id = MIN(seg_id, buf[x]);
@@ -619,22 +485,48 @@
   return seg_id;
 }
 
+void vp9_setup_src_planes(MACROBLOCK *x,
+                          const YV12_BUFFER_CONFIG *src,
+                          int mb_row, int mb_col) {
+  uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                         src->alpha_buffer};
+  int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                    src->alpha_stride};
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    setup_pred_plane(&x->plane[i].src,
+                     buffers[i], strides[i],
+                     mb_row, mb_col, NULL,
+                     x->e_mbd.plane[i].subsampling_x,
+                     x->e_mbd.plane[i].subsampling_y);
+  }
+}
+
 static void set_offsets(VP9_COMP *cpi,
-                        int mb_row, int mb_col, int block_size) {
+                        int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
   const int dst_fb_idx = cm->new_fb_idx;
+  const int idx_str = xd->mode_info_stride * mi_row + mi_col;
+  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+  const int mb_row = mi_row >> 1;
+  const int mb_col = mi_col >> 1;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
-  const int idx_str = xd->mode_info_stride * mb_row + mb_col;
+  int i;
 
   // entropy context structures
-  xd->above_context = cm->above_context + mb_col;
-  xd->left_context  = cm->left_context + (mb_row & 3);
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].above_context = cm->above_context[i] +
+        (mi_col * 2 >>  xd->plane[i].subsampling_x);
+    xd->plane[i].left_context = cm->left_context[i] +
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+  }
 
-  // GF active flags data structure
-  x->gf_active_ptr = (signed char *)&cpi->gf_active_flags[idx_map];
+  // partition contexts
+  set_partition_seg_context(cm, xd, mi_row, mi_col);
 
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -644,30 +536,29 @@
   x->partition_info          = x->pi + idx_str;
   xd->mode_info_context      = cm->mi + idx_str;
   mbmi = &xd->mode_info_context->mbmi;
-  xd->prev_mode_info_context = cm->prev_mi + idx_str;
+  // Special case: if prev_mi is NULL, the previous mode info context
+  // cannot be used.
+  xd->prev_mode_info_context = cm->prev_mi ?
+                                 cm->prev_mi + idx_str : NULL;
 
   // Set up destination pointers
-  setup_pred_block(&xd->dst,
-                   &cm->yv12_fb[dst_fb_idx],
-                   mb_row, mb_col, NULL, NULL);
+  setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
 
   /* Set up limit values for MV components to prevent them from
    * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
-  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
+  x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE +
+                   (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
+  x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE +
+                   (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
 
   // Set up distance of MB to edge of frame in 1/8th pel units
-  block_size >>= 4;  // in macroblock units
-  assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));
-  set_mb_row(cm, xd, mb_row, block_size);
-  set_mb_col(cm, xd, mb_col, block_size);
+  assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1)));
+  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
   /* set up source buffers */
-  setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);
+  vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
 
   /* R/D setup */
   x->rddiv = cpi->RDDIV;
@@ -675,23 +566,17 @@
 
   /* segment ID */
   if (xd->segmentation_enabled) {
-    if (xd->update_mb_segmentation_map) {
-      mbmi->segment_id = find_seg_id(cpi->segmentation_map, block_size,
-                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
-    } else {
-      mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, block_size,
-                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
-    }
-    assert(mbmi->segment_id <= 3);
+    uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
+                                                  : cm->last_frame_seg_map;
+    mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row,
+                                   cm->mi_rows, mi_col, cm->mi_cols);
+
+    assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
     vp9_mb_init_quantizer(cpi, x);
 
     if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
         !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-        vp9_check_segref(xd, 1, INTRA_FRAME)  +
-        vp9_check_segref(xd, 1, LAST_FRAME)   +
-        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
       cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
     } else {
       const int y = mb_row & ~3;
@@ -698,8 +583,10 @@
       const int x = mb_col & ~3;
       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
-      const int tile_progress = cm->cur_tile_mb_col_start * cm->mb_rows;
-      const int mb_cols = cm->cur_tile_mb_col_end - cm->cur_tile_mb_col_start;
+      const int tile_progress =
+          cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
+      const int mb_cols =
+          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
 
       cpi->seg0_progress =
           ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
@@ -709,453 +596,824 @@
   }
 }
 
-static int pick_mb_modes(VP9_COMP *cpi,
-                         int mb_row0,
-                         int mb_col0,
-                         TOKENEXTRA **tp,
-                         int *totalrate,
-                         int *totaldist) {
+static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
+                          TOKENEXTRA **tp, int *totalrate, int *totaldist,
+                          BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int i;
-  int splitmodes_used = 0;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
-                                                      + mb_col0;
 
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy(left_context,
-             cm->left_context + (mb_row0 & 2),
-             sizeof(left_context));
-  vpx_memcpy(above_context,
-             initial_above_context_ptr,
-             sizeof(above_context));
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
 
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    const int x_idx = i & 1, y_idx = i >> 1;
-    const int mb_row = mb_row0 + y_idx;
-    const int mb_col = mb_col0 + x_idx;
-    MB_MODE_INFO *mbmi;
+  set_offsets(cpi, mi_row, mi_col, bsize);
+  xd->mode_info_context->mbmi.sb_type = bsize;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp9_activity_masking(cpi, x);
 
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      continue;
-    }
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx);
+  } else {
+    vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
+                              bsize, ctx);
+  }
+}
 
-    // Index of the MB in the SB 0..3
-    xd->mb_index = i;
-    set_offsets(cpi, mb_row, mb_col, 16);
+static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
+  if (cm->frame_type != KEY_FRAME) {
+    int segment_id, seg_ref_active;
 
-    mbmi = &xd->mode_info_context->mbmi;
-    mbmi->sb_type = BLOCK_SIZE_MB16X16;
+    segment_id = mbmi->segment_id;
+    seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                           SEG_LVL_REF_FRAME);
 
-    // Find best coding mode & reconstruct the MB so it is available
-    // as a predictor for MBs that follow in the SB
-    if (cm->frame_type == KEY_FRAME) {
-      int r, d;
-#if 0  // ENC_DEBUG
-      if (enc_debug)
-        printf("intra pick_mb_modes %d %d\n", mb_row, mb_col);
-#endif
-      vp9_rd_pick_intra_mode(cpi, x, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
+    if (!seg_ref_active)
+      cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
+                            [mbmi->ref_frame[0] > INTRA_FRAME]++;
 
-      // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, tp, 0, mb_row, mb_col);
+    // If the segment reference feature is enabled we have only a single
+    // reference frame allowed for the segment so exclude it from
+    // the reference frame counts used to work out probabilities.
+    if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) {
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        cpi->comp_inter_count[vp9_get_pred_context(cm, xd,
+                                                   PRED_COMP_INTER_INTER)]
+                             [mbmi->ref_frame[1] > INTRA_FRAME]++;
 
-      // Note the encoder may have changed the segment_id
-
-      // Save the coding context
-      vpx_memcpy(&x->mb_context[xd->sb_index][i].mic, xd->mode_info_context,
-                 sizeof(MODE_INFO));
-    } else {
-      int seg_id, r, d;
-
-#if 0  // ENC_DEBUG
-      if (enc_debug)
-        printf("inter pick_mb_modes %d %d\n", mb_row, mb_col);
-#endif
-      vp9_pick_mode_inter_macroblock(cpi, x, mb_row, mb_col, &r, &d);
-      *totalrate += r;
-      *totaldist += d;
-
-      splitmodes_used += (mbmi->mode == SPLITMV);
-
-      // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, tp, 0, mb_row, mb_col);
-
-      seg_id = mbmi->segment_id;
-      if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
-        cpi->seg0_idx++;
+      if (mbmi->ref_frame[1] > INTRA_FRAME) {
+        cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)]
+                           [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
+      } else {
+        cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)]
+                             [0][mbmi->ref_frame[0] != LAST_FRAME]++;
+        if (mbmi->ref_frame[0] != LAST_FRAME)
+          cpi->single_ref_count[vp9_get_pred_context(cm, xd,
+                                                     PRED_SINGLE_REF_P2)]
+                               [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++;
       }
-      if (!xd->segmentation_enabled ||
-          !vp9_segfeature_active(xd, seg_id, SEG_LVL_REF_FRAME) ||
-          vp9_check_segref(xd, seg_id, INTRA_FRAME)  +
-          vp9_check_segref(xd, seg_id, LAST_FRAME)   +
-          vp9_check_segref(xd, seg_id, GOLDEN_FRAME) +
-          vp9_check_segref(xd, seg_id, ALTREF_FRAME) > 1) {
-        // Get the prediction context and status
-        int pred_flag = vp9_get_pred_flag(xd, PRED_REF);
-        int pred_context = vp9_get_pred_context(cm, xd, PRED_REF);
-
-        // Count prediction success
-        cpi->ref_pred_count[pred_context][pred_flag]++;
-      }
     }
+    // Count of last ref frame 0,0 usage
+    if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame[0] == LAST_FRAME))
+      cpi->inter_zz_count++;
   }
+}
 
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context + (mb_row0 & 2),
-             left_context,
-             sizeof(left_context));
-  vpx_memcpy(initial_above_context_ptr,
-             above_context,
-             sizeof(above_context));
+// TODO(jingning): the variables used here are little complicated. need further
+// refactoring on organizing the the temporary buffers, when recursive
+// partition down to 4x4 block size is enabled.
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
 
-  return splitmodes_used;
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_context;
+    case BLOCK_SIZE_SB64X32:
+      return &x->sb64x32_context[xd->sb_index];
+    case BLOCK_SIZE_SB32X64:
+      return &x->sb32x64_context[xd->sb_index];
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb32_context[xd->sb_index];
+    case BLOCK_SIZE_SB32X16:
+      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB16X32:
+      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_context[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB16X8:
+      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X16:
+      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X8:
+      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB8X4:
+      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB4X8:
+      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_AB4X4:
+      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    default:
+      assert(0);
+      return NULL;
+  }
 }
 
-static void pick_sb_modes(VP9_COMP *cpi,
-                          int mb_row,
-                          int mb_col,
-                          TOKENEXTRA **tp,
-                          int *totalrate,
-                          int *totaldist) {
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_partitioning;
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb_partitioning[xd->sb_index];
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+    case BLOCK_SIZE_SB8X8:
+      return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8],
+                            PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    vpx_memcpy(cm->above_context[p] +
+               ((mi_col * 2) >> xd->plane[p].subsampling_x),
+               a + bw * p,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               l + bh * p,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(cm->above_seg_context + mi_col, sa,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
+             sizeof(PARTITION_CONTEXT) * mh);
+}
+static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                          ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                          ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                          PARTITION_CONTEXT sa[8],
+                          PARTITION_CONTEXT sl[8],
+                          BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
 
-  set_offsets(cpi, mb_row, mb_col, 32);
-  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32;
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-
-  /* Find best coding mode & reconstruct the MB so it is available
-   * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb32(cpi, x,
-                                totalrate,
-                                totaldist);
-
-    /* Save the coding context */
-    vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,
-               sizeof(MODE_INFO));
-  } else {
-    vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist);
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bw * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bh * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
   }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * mh);
 }
 
-static void pick_sb64_modes(VP9_COMP *cpi,
-                            int mb_row,
-                            int mb_col,
-                            TOKENEXTRA **tp,
-                            int *totalrate,
-                            int *totaldist) {
+static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
+                     int mi_row, int mi_col, int output_enabled,
+                     BLOCK_SIZE_TYPE bsize, int sub_index) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  set_offsets(cpi, mb_row, mb_col, 64);
-  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  /* Find best coding mode & reconstruct the MB so it is available
-   * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist);
+  if (sub_index != -1)
+    *(get_sb_index(xd, bsize)) = sub_index;
 
-    /* Save the coding context */
-    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO));
-  } else {
-    vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist);
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index > 0)
+      return;
+  set_offsets(cpi, mi_row, mi_col, bsize);
+  update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
+
+  if (output_enabled) {
+    update_stats(cpi, mi_row, mi_col);
+
+    (*tp)->token = EOSB_TOKEN;
+    (*tp)++;
   }
 }
 
-static void update_stats(VP9_COMP *cpi, int mb_row, int mb_col) {
+static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
+                      int mi_row, int mi_col, int output_enabled,
+                      BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+  const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
+  int bwl, bhl;
+  int UNINITIALIZED_IS_SAFE(pl);
 
-  if (cm->frame_type == KEY_FRAME) {
-#ifdef MODE_STATS
-    y_modes[mbmi->mode]++;
-#endif
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  c1 = BLOCK_SIZE_AB4X4;
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    c1 = *(get_sb_partitioning(x, bsize));
+  }
+
+  bwl = b_width_log2(c1), bhl = b_height_log2(c1);
+
+  if (bsl == bwl && bsl == bhl) {
+    if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
+        cpi->partition_count[pl][PARTITION_NONE]++;
+    encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+  } else if (bsl == bhl && bsl > bwl) {
+    if (output_enabled)
+      cpi->partition_count[pl][PARTITION_VERT]++;
+    encode_b(cpi, tp, mi_row, mi_col,      output_enabled, c1, 0);
+    encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+  } else if (bsl == bwl && bsl > bhl) {
+    if (output_enabled)
+      cpi->partition_count[pl][PARTITION_HORZ]++;
+    encode_b(cpi, tp, mi_row,      mi_col, output_enabled, c1, 0);
+    encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
   } else {
-    int segment_id, seg_ref_active;
+    BLOCK_SIZE_TYPE subsize;
+    int i;
 
-    if (mbmi->ref_frame) {
-      int pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
+    assert(bwl < bsl && bhl < bsl);
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
 
-      if (mbmi->second_ref_frame <= INTRA_FRAME)
-        cpi->single_pred_count[pred_context]++;
-      else
-        cpi->comp_pred_count[pred_context]++;
+    if (output_enabled)
+      cpi->partition_count[pl][PARTITION_SPLIT]++;
+
+    for (i = 0; i < 4; i++) {
+      const int x_idx = i & 1, y_idx = i >> 1;
+
+      *(get_sb_index(xd, subsize)) = i;
+      encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+                output_enabled, subsize);
     }
+  }
 
-#ifdef MODE_STATS
-    inter_y_modes[mbmi->mode]++;
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, c1, bsize);
+  }
+}
 
-    if (mbmi->mode == SPLITMV) {
-      int b;
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+                             BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bsl = b_width_log2(bsize);
+  int bs = (1 << bsl) / 2;  //
+  int block_row, block_col;
+  int row, col;
 
-      for (b = 0; b < x->partition_info->count; b++) {
-        inter_b_modes[x->partition_info->bmi[b].mode]++;
+  // this test function sets the entire macroblock to the same bsize
+  for (block_row = 0; block_row < 8; block_row += bs) {
+    for (block_col = 0; block_col < 8; block_col += bs) {
+      for (row = 0; row < bs; row++) {
+        for (col = 0; col < bs; col++) {
+          m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
+        }
       }
     }
-#endif
+  }
+}
 
-    // If we have just a single reference frame coded for a segment then
-    // exclude from the reference frame counts used to work out
-    // probabilities. NOTE: At the moment we dont support custom trees
-    // for the reference frame coding for each segment but this is a
-    // possible future action.
-    segment_id = mbmi->segment_id;
-    seg_ref_active = vp9_segfeature_active(xd, segment_id,
-                                           SEG_LVL_REF_FRAME);
-    if (!seg_ref_active ||
-        ((vp9_check_segref(xd, segment_id, INTRA_FRAME) +
-          vp9_check_segref(xd, segment_id, LAST_FRAME) +
-          vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-          vp9_check_segref(xd, segment_id, ALTREF_FRAME)) > 1)) {
-      cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
+static void set_block_size(VP9_COMMON *const cm,
+                           MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
+                           int mi_row, int mi_col) {
+  int row, col;
+  int bwl = b_width_log2(bsize);
+  int bhl = b_height_log2(bsize);
+  int bsl = (bwl > bhl ? bwl : bhl);
+
+  int bs = (1 << bsl) / 2;  //
+  MODE_INFO *m2 = m + mi_row * mis + mi_col;
+  for (row = 0; row < bs; row++) {
+    for (col = 0; col < bs; col++) {
+      if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
+        continue;
+      m2[row*mis+col].mbmi.sb_type = bsize;
     }
-    // Count of last ref frame 0,0 usage
-    if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
-      cpi->inter_zz_count++;
   }
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col);
-#endif
 }
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int count;
+  int variance;
+} var;
 
-static void encode_sb(VP9_COMP *cpi,
-                      int mb_row,
-                      int mb_col,
-                      int output_enabled,
-                      TOKENEXTRA **tp, int is_sb) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+#define VT(TYPE, BLOCKSIZE) \
+  typedef struct { \
+    var none; \
+    var horz[2]; \
+    var vert[2]; \
+    BLOCKSIZE split[4]; } TYPE;
 
-  cpi->sb32_count[is_sb]++;
-  if (is_sb) {
-    set_offsets(cpi, mb_row, mb_col, 32);
-    update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled);
+VT(v8x8, var)
+VT(v16x16, v8x8)
+VT(v32x32, v16x16)
+VT(v64x64, v32x32)
 
-    encode_superblock32(cpi, tp,
-                        output_enabled, mb_row, mb_col);
-    if (output_enabled) {
-      update_stats(cpi, mb_row, mb_col);
-    }
+typedef enum {
+  V16X16,
+  V32X32,
+  V64X64,
+} TREE_LEVEL;
 
-    if (output_enabled) {
-      (*tp)->Token = EOSB_TOKEN;
-      (*tp)++;
-      if (mb_row < cm->mb_rows)
-        cpi->tplist[mb_row].stop = *tp;
-    }
-  } else {
-    int i;
+// Set variance values given sum square error, sum error, count.
+static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->count = c;
+  v->variance = 256
+      * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+      / v->count;
+}
 
-    for (i = 0; i < 4; i++) {
-      const int x_idx = i & 1, y_idx = i >> 1;
+// Combine 2 variance structures by summing the sum_error, sum_square_error,
+// and counts and then calculating the new variance.
+void sum_2_variances(var *r, var *a, var*b) {
+  fill_variance(r, a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->count + b->count);
+}
+// Fill one level of our variance tree,  by summing the split sums into each of
+// the horizontal, vertical and none from split and recalculating variance.
+#define fill_variance_tree(VT) \
+  sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
+  sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
+  sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
+  sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
+  sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
 
-      if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
-        // MB lies outside frame, move on
-        continue;
-      }
+// Set the blocksize in the macroblock info structure if the variance is less
+// than our threshold to one of none, horz, vert.
+#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
+  if (VT.none.variance < threshold) { \
+    set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
+    ACTION; \
+  } \
+  if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
+    set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
+    ACTION; \
+  }
 
-      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16);
-      xd->mb_index = i;
-      update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled);
+static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
+                                int mi_col) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  // TODO(JBB): More experimentation or testing of this threshold;
+  int64_t threshold = 4;
+  int i, j, k;
+  v64x64 vt;
+  unsigned char * s;
+  int sp;
+  const unsigned char * d = xd->plane[0].pre->buf;
+  int dp = xd->plane[0].pre->stride;
+  int pixels_wide = 64, pixels_high = 64;
 
-      if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-        vp9_activity_masking(cpi, x);
+  vpx_memset(&vt, 0, sizeof(vt));
 
-      encode_macroblock(cpi, tp,
-                        output_enabled, mb_row + y_idx, mb_col + x_idx);
-      if (output_enabled) {
-        update_stats(cpi, mb_row + y_idx, mb_col + x_idx);
-      }
+  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
 
-      if (output_enabled) {
-        (*tp)->Token = EOSB_TOKEN;
-       (*tp)++;
-        if (mb_row + y_idx < cm->mb_rows)
-          cpi->tplist[mb_row + y_idx].stop = *tp;
-      }
+  if (xd->mb_to_right_edge < 0)
+    pixels_wide += (xd->mb_to_right_edge >> 3);
+
+  if (xd->mb_to_bottom_edge < 0)
+    pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+  s = x->plane[0].src.buf;
+  sp = x->plane[0].src.stride;
+
+  // TODO(JBB): Clearly the higher the quantizer the fewer partitions we want
+  // but this needs more experimentation.
+  threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
+
+  // if ( cm->frame_type == KEY_FRAME ) {
+  d = vp9_64x64_zeros;
+  dp = 64;
+  // }
+
+  // Fill in the entire tree of 8x8 variances for splits.
+  for (i = 0; i < 4; i++) {
+    const int x32_idx = ((i & 1) << 5);
+    const int y32_idx = ((i >> 1) << 5);
+    for (j = 0; j < 4; j++) {
+      const int x_idx = x32_idx + ((j & 1) << 4);
+      const int y_idx = y32_idx + ((j >> 1) << 4);
+      const uint8_t *st = s + y_idx * sp + x_idx;
+      const uint8_t *dt = d + y_idx * dp + x_idx;
+      unsigned int sse = 0;
+      int sum = 0;
+      v16x16 *vst = &vt.split[i].split[j];
+      sse = sum = 0;
+      if (x_idx < pixels_wide && y_idx < pixels_high)
+        vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum);
+      fill_variance(&vst->split[0].none, sse, sum, 64);
+      sse = sum = 0;
+      if (x_idx + 8 < pixels_wide && y_idx < pixels_high)
+        vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum);
+      fill_variance(&vst->split[1].none, sse, sum, 64);
+      sse = sum = 0;
+      if (x_idx < pixels_wide && y_idx + 8 < pixels_high)
+        vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum);
+      fill_variance(&vst->split[2].none, sse, sum, 64);
+      sse = sum = 0;
+      if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high)
+        vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse,
+                            &sum);
+      fill_variance(&vst->split[3].none, sse, sum, 64);
     }
   }
+  // Fill the rest of the variance tree by summing the split partition
+  // values.
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      fill_variance_tree(&vt.split[i].split[j])
+    }
+    fill_variance_tree(&vt.split[i])
+  }
+  fill_variance_tree(&vt)
 
-  // debug output
-#if DBG_PRNT_SEGMAP
-  {
-    FILE *statsfile;
-    statsfile = fopen("segmap2.stt", "a");
-    fprintf(statsfile, "\n");
-    fclose(statsfile);
+  // Now go through the entire structure,  splitting every blocksize until
+  // we get to one that's got a variance lower than our threshold,  or we
+  // hit 8x8.
+  set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
+  for (i = 0; i < 4; ++i) {
+    const int x32_idx = ((i & 1) << 2);
+    const int y32_idx = ((i >> 1) << 2);
+    set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx,
+                continue);
+
+    for (j = 0; j < 4; ++j) {
+      const int x16_idx = ((j & 1) << 1);
+      const int y16_idx = ((j >> 1) << 1);
+      set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx,
+                  mi_col+x32_idx+x16_idx, continue);
+
+      for (k = 0; k < 4; ++k) {
+        const int x8_idx = (k & 1);
+        const int y8_idx = (k >> 1);
+        set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+                       mi_row + y32_idx + y16_idx + y8_idx,
+                       mi_col + x32_idx + x16_idx + x8_idx);
+      }
+    }
   }
-#endif
 }
+static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
+                             int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
+                             int *rate, int *dist) {
+  VP9_COMMON * const cm = &cpi->common;
+  MACROBLOCK * const x = &cpi->mb;
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mis = cm->mode_info_stride;
+  int bwl = b_width_log2(m->mbmi.sb_type);
+  int bhl = b_height_log2(m->mbmi.sb_type);
+  int bsl = b_width_log2(bsize);
+  int bh = (1 << bhl);
+  int bs = (1 << bsl);
+  int bss = (1 << bsl)/4;
+  int i, pl;
+  PARTITION_TYPE partition;
+  BLOCK_SIZE_TYPE subsize;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  int r = 0, d = 0;
 
-static void encode_sb64(VP9_COMP *cpi,
-                        int mb_row,
-                        int mb_col,
-                        TOKENEXTRA **tp, int is_sb[4]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  cpi->sb64_count[is_sb[0] == 2]++;
-  if (is_sb[0] == 2) {
-    set_offsets(cpi, mb_row, mb_col, 64);
-    update_state(cpi, &x->sb64_context, 64, 1);
-    encode_superblock64(cpi, tp,
-                        1, mb_row, mb_col);
-    update_stats(cpi, mb_row, mb_col);
 
-    (*tp)->Token = EOSB_TOKEN;
-    (*tp)++;
-    if (mb_row < cm->mb_rows)
-      cpi->tplist[mb_row].stop = *tp;
-  } else {
-    int i;
+  // parse the partition type
+  if ((bwl == bsl) && (bhl == bsl))
+    partition = PARTITION_NONE;
+  else if ((bwl == bsl) && (bhl < bsl))
+    partition = PARTITION_HORZ;
+  else if ((bwl < bsl) && (bhl == bsl))
+    partition = PARTITION_VERT;
+  else if ((bwl < bsl) && (bhl < bsl))
+    partition = PARTITION_SPLIT;
+  else
+    assert(0);
 
-    for (i = 0; i < 4; i++) {
-      const int x_idx = i & 1, y_idx = i >> 1;
+  subsize = get_subsize(bsize, partition);
 
-      if (mb_row + y_idx * 2 >= cm->mb_rows ||
-          mb_col + x_idx * 2 >= cm->mb_cols) {
-        // MB lies outside frame, move on
-        continue;
+  // TODO(JBB): this restriction is here because pick_sb_modes can return
+  // r's that are INT_MAX meaning we can't select a mode / mv for this block.
+  // when the code is made to work for less than sb8x8 we need to come up with
+  // a solution to this problem.
+  assert(subsize >= BLOCK_SIZE_SB8X8);
+
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
+    xd->above_seg_context = cm->above_seg_context + mi_col;
+    *(get_sb_partitioning(x, bsize)) = subsize;
+  }
+
+  pl = partition_plane_context(xd, bsize);
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  switch (partition) {
+    case PARTITION_NONE:
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                    get_block_context(x, bsize));
+      r += x->partition_cost[pl][PARTITION_NONE];
+      break;
+    case PARTITION_HORZ:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_row + (bh >> 1) <= cm->mi_rows) {
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
       }
-      xd->sb_index = i;
-      encode_sb(cpi, mb_row + 2 * y_idx, mb_col + 2 * x_idx, 1, tp,
-                is_sb[i]);
-    }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_HORZ];
+      break;
+    case PARTITION_VERT:
+      *(get_sb_index(xd, subsize)) = 0;
+      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      if (mi_col + (bs >> 1) <= cm->mi_cols) {
+        int rt, dt;
+        update_state(cpi, get_block_context(x, subsize), subsize, 0);
+        encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+        *(get_sb_index(xd, subsize)) = 1;
+        pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize,
+                      get_block_context(x, subsize));
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_VERT];
+      restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+      break;
+    case PARTITION_SPLIT:
+      for (i = 0; i < 4; i++) {
+        int x_idx = (i & 1) * (bs >> 2);
+        int y_idx = (i >> 1) * (bs >> 2);
+        int jj = i >> 1, ii = i & 0x01;
+        int rt, dt;
+
+        if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+          continue;
+
+        *(get_sb_index(xd, subsize)) = i;
+
+        rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
+                         mi_col + x_idx, subsize, &rt, &dt);
+        r += rt;
+        d += dt;
+      }
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_SPLIT];
+      break;
+    default:
+      assert(0);
   }
+
+  // update partition context
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8
+      && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (r < INT_MAX && d < INT_MAX)
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+  *rate = r;
+  *dist = d;
 }
 
-static void encode_sb_row(VP9_COMP *cpi,
-                          int mb_row,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previously rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize,
+                              int *rate, int *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int mb_col;
+  int bsl = b_width_log2(bsize), bs = 1 << bsl;
+  int ms = bs / 2;
+  ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  int i, pl;
+  BLOCK_SIZE_TYPE subsize;
+  int srate = INT_MAX, sdist = INT_MAX;
 
-  // Initialize the left context for the new SB row
-  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+  assert(mi_height_log2(bsize) == mi_width_log2(bsize));
 
-  // Code each SB in the row
-  for (mb_col = cm->cur_tile_mb_col_start;
-       mb_col < cm->cur_tile_mb_col_end; mb_col += 4) {
-    int i;
-    int sb32_rate = 0, sb32_dist = 0;
-    int is_sb[4];
-    int sb64_rate = INT_MAX, sb64_dist;
-    int sb64_skip = 0;
-    ENTROPY_CONTEXT_PLANES l[4], a[4];
-    TOKENEXTRA *tp_orig = *tp;
+  save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
 
-    memcpy(&a, cm->above_context + mb_col, sizeof(a));
-    memcpy(&l, cm->left_context, sizeof(l));
-    for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << 1, y_idx = i & 2;
-      int mb_rate = 0, mb_dist = 0;
-      int sb_rate = INT_MAX, sb_dist;
-      int splitmodes_used = 0;
-      int sb32_skip = 0;
+  // PARTITION_SPLIT
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+    int r4 = 0, d4 = 0;
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    *(get_sb_partitioning(x, bsize)) = subsize;
 
-      if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)
+    for (i = 0; i < 4; ++i) {
+      int x_idx = (i & 1) * (ms >> 1);
+      int y_idx = (i >> 1) * (ms >> 1);
+      int r = 0, d = 0;
+
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      xd->sb_index = i;
+      *(get_sb_index(xd, subsize)) = i;
+      rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                        &r, &d);
 
-      splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
-                                      tp, &mb_rate, &mb_dist);
+      r4 += r;
+      d4 += d;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    if (r4 < INT_MAX)
+      r4 += x->partition_cost[pl][PARTITION_SPLIT];
+    assert(r4 >= 0);
+    assert(d4 >= 0);
+    srate = r4;
+    sdist = d4;
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-      mb_rate += vp9_cost_bit(cm->sb32_coded, 0);
+  // PARTITION_HORZ
+  if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+    int r2, d2;
+    int r = 0, d = 0;
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
 
-      if (cpi->sf.splitmode_breakout) {
-        sb32_skip = splitmodes_used;
-        sb64_skip += splitmodes_used;
-      }
+    if (mi_row + (ms >> 1) < cm->mi_rows) {
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-      if ( !sb32_skip &&
-           !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) ||
-             ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
-        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-        pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
-                      tp, &sb_rate, &sb_dist);
-        sb_rate += vp9_cost_bit(cm->sb32_coded, 1);
-      }
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    if (r2 < INT_MAX)
+      r2 += x->partition_cost[pl][PARTITION_HORZ];
+    if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-      /* Decide whether to encode as a SB or 4xMBs */
-      if (sb_rate < INT_MAX &&
-          RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
-              RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
-        is_sb[i] = 1;
-        sb32_rate += sb_rate;
-        sb32_dist += sb_dist;
-      } else {
-        is_sb[i] = 0;
-        sb32_rate += mb_rate;
-        sb32_dist += mb_dist;
+  // PARTITION_VERT
+  if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
+    int r2, d2;
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+    if (mi_col + (ms >> 1) < cm->mi_cols) {
+      int r = 0, d = 0;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-        // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
-        if (cpi->sf.mb16_breakout) {
-          ++sb64_skip;
-        }
-      }
-
-      /* Encode SB using best computed mode(s) */
-      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
-      // for each level that we go up, we can just keep tokens and recon
-      // pixels of the lower level; also, inverting SB/MB order (big->small
-      // instead of small->big) means we can use as threshold for small, which
-      // may enable breakouts if RD is not good enough (i.e. faster)
-      encode_sb(cpi, mb_row + y_idx, mb_col + x_idx, 0, tp, is_sb[i]);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
     }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    if (r2 < INT_MAX)
+      r2 += x->partition_cost[pl][PARTITION_VERT];
+    if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
 
-    memcpy(cm->above_context + mb_col, &a, sizeof(a));
-    memcpy(cm->left_context, &l, sizeof(l));
-    sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);
-
-    if (!sb64_skip &&
-        !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) ||
-          ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {
-      pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist);
-      sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);
+  // PARTITION_NONE
+  if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+      (mi_col + (ms >> 1) < cm->mi_cols)) {
+    int r, d;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                  get_block_context(x, bsize));
+    if (bsize >= BLOCK_SIZE_SB8X8) {
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_NONE];
     }
 
-    /* Decide whether to encode as a SB or 4xMBs */
-    if (sb64_rate < INT_MAX &&
-        RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist) <
-            RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-      is_sb[0] = 2;
-      *totalrate += sb64_rate;
-    } else {
-      *totalrate += sb32_rate;
+    if (RDCOST(x->rdmult, x->rddiv, r, d) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r;
+      sdist = d;
+      if (bsize >= BLOCK_SIZE_SB8X8)
+        *(get_sb_partitioning(x, bsize)) = bsize;
     }
+  }
 
-    assert(tp_orig == *tp);
-    encode_sb64(cpi, mb_row, mb_col, tp, is_sb);
+  *rate = srate;
+  *dist = sdist;
+
+  restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+
+  if (srate < INT_MAX && sdist < INT_MAX)
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+
+  if (bsize == BLOCK_SIZE_SB64X64) {
     assert(tp_orig < *tp);
+    assert(srate < INT_MAX);
+    assert(sdist < INT_MAX);
+  } else {
+    assert(tp_orig == *tp);
   }
 }
 
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+                       TOKENEXTRA **tp, int *totalrate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
+
+  // Initialize the left context for the new SB row
+  vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
+  vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+
+  // Code each SB in the row
+  for (mi_col = cm->cur_tile_mi_col_start;
+       mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) {
+    int dummy_rate, dummy_dist;
+    if (cpi->speed < 5) {
+      rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                        &dummy_rate, &dummy_dist);
+    } else {
+      const int idx_str = cm->mode_info_stride * mi_row + mi_col;
+      MODE_INFO *m = cm->mi + idx_str;
+      // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64);
+      choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+      rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                       &dummy_rate, &dummy_dist);
+    }
+  }
+}
+
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
@@ -1163,7 +1421,6 @@
 
   x->act_zbin_adj = 0;
   cpi->seg0_idx = 0;
-  vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
 
   xd->mode_info_stride = cm->mode_info_stride;
   xd->frame_type = cm->frame_type;
@@ -1176,42 +1433,39 @@
     vp9_init_mbmode_probs(cm);
 
   // Copy data over into macro block data structures.
-  x->src = *cpi->Source;
-  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
-  xd->dst = cm->yv12_fb[cm->new_fb_idx];
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
 
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+  // TODO(jkoleszar): are these initializations required?
+  setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
+                   0, 0, NULL, NULL);
+  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
-  vp9_setup_block_ptrs(x);
-
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
-  vp9_zero(cpi->count_mb_ref_frame_usage)
-  vp9_zero(cpi->bmode_count)
-  vp9_zero(cpi->ymode_count)
-  vp9_zero(cpi->i8x8_mode_count)
+  vp9_zero(cpi->y_mode_count)
   vp9_zero(cpi->y_uv_mode_count)
-  vp9_zero(cpi->sub_mv_ref_count)
-  vp9_zero(cpi->mbsplit_count)
-  vp9_zero(cpi->common.fc.mv_ref_ct)
-  vp9_zero(cpi->sb_ymode_count)
-  vp9_zero(cpi->sb32_count);
-  vp9_zero(cpi->sb64_count);
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_zero(cpi->interintra_count);
-  vp9_zero(cpi->interintra_select_count);
-#endif
+  vp9_zero(cm->fc.inter_mode_counts)
+  vp9_zero(cpi->partition_count);
+  vp9_zero(cpi->intra_inter_count);
+  vp9_zero(cpi->comp_inter_count);
+  vp9_zero(cpi->single_ref_count);
+  vp9_zero(cpi->comp_ref_count);
+  vp9_zero(cm->fc.tx_count_32x32p);
+  vp9_zero(cm->fc.tx_count_16x16p);
+  vp9_zero(cm->fc.tx_count_8x8p);
+  vp9_zero(cm->fc.mbskip_count);
 
-  vpx_memset(cm->above_context, 0,
-             sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
-
-  xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
+                                      MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
+  vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+                                       mi_cols_aligned_to_sb(cm));
 }
 
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
@@ -1218,37 +1472,32 @@
   if (lossless) {
     cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
     cpi->mb.optimize              = 0;
     cpi->common.filter_level      = 0;
-    cpi->zbin_mode_boost_enabled  = FALSE;
+    cpi->zbin_mode_boost_enabled  = 0;
     cpi->common.txfm_mode         = ONLY_4X4;
   } else {
     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 }
 
 
 static void encode_frame_internal(VP9_COMP *cpi) {
-  int mb_row;
+  int mi_row;
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int totalrate;
 
-//   fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
-//            cpi->common.current_video_frame, cpi->common.show_frame,
-//            cm->frame_type);
+//  fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
+//           cpi->common.current_video_frame, cpi->common.show_frame,
+//           cm->frame_type);
 
-  // Compute a modified set of reference frame probabilities to use when
-  // prediction fails. These are based on the current general estimates for
-  // this frame which may be updated with each iteration of the recode loop.
-  vp9_compute_mod_refprobs(cm);
-
 // debug output
 #if DBG_PRNT_SEGMAP
   {
@@ -1264,10 +1513,7 @@
   // Reset frame count of inter 0,0 motion vector usage.
   cpi->inter_zz_count = 0;
 
-  cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
-  cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
-
-  vp9_zero(cpi->switchable_interp_count);
+  vp9_zero(cm->fc.switchable_interp_count);
   vp9_zero(cpi->best_switchable_interp_count);
 
   xd->mode_info_context = cm->mi;
@@ -1274,31 +1520,18 @@
   xd->prev_mode_info_context = cm->prev_mi;
 
   vp9_zero(cpi->NMVcount);
-  vp9_zero(cpi->coef_counts_4x4);
-  vp9_zero(cpi->coef_counts_8x8);
-  vp9_zero(cpi->coef_counts_16x16);
-  vp9_zero(cpi->coef_counts_32x32);
+  vp9_zero(cpi->coef_counts);
   vp9_zero(cm->fc.eob_branch_counts);
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(cm->fc.nzc_counts_4x4);
-  vp9_zero(cm->fc.nzc_counts_8x8);
-  vp9_zero(cm->fc.nzc_counts_16x16);
-  vp9_zero(cm->fc.nzc_counts_32x32);
-  vp9_zero(cm->fc.nzc_pcat_counts);
-#endif
-#if CONFIG_NEW_MVREF
-  vp9_zero(cpi->mb_mv_ref_count);
-#endif
 
-  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&
-                            cm->y1dc_delta_q == 0 &&
-                            cm->uvdc_delta_q == 0 &&
-                            cm->uvac_delta_q == 0);
+  cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
+                           cm->y_dc_delta_q == 0 &&
+                           cm->uv_dc_delta_q == 0 &&
+                           cm->uv_ac_delta_q == 0;
   switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
 
   vp9_frame_init_quantizer(cpi);
 
-  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+  vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
   vp9_initialize_me_consts(cpi, cm->base_qindex);
 
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
@@ -1313,12 +1546,11 @@
   init_encode_frame_mb_context(cpi);
 
   vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
-  vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
-  vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
-  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
-  vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
   vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
+  vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes));
+
+  set_prev_mi(cm);
+
   {
     struct vpx_usec_timer  emr_timer;
     vpx_usec_timer_start(&emr_timer);
@@ -1336,11 +1568,13 @@
 
           // For each row of SBs in the frame
           vp9_get_tile_col_offsets(cm, tile_col);
-          for (mb_row = cm->cur_tile_mb_row_start;
-               mb_row < cm->cur_tile_mb_row_end; mb_row += 4) {
-            encode_sb_row(cpi, mb_row, &tp, &totalrate);
-          }
+          for (mi_row = cm->cur_tile_mi_row_start;
+               mi_row < cm->cur_tile_mi_row_end;
+               mi_row += 8)
+            encode_sb_row(cpi, mi_row, &tp, &totalrate);
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
+          assert(tp - cpi->tok <=
+                 get_token_alloc(cm->mb_rows, cm->mb_cols));
         }
       }
     }
@@ -1365,15 +1599,6 @@
   int ref_flags = cpi->ref_frame_flags;
 
   if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
-    if ((ref_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) == (VP9_LAST_FLAG | VP9_GOLD_FLAG) &&
-        vp9_check_segref(xd, 1, LAST_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) == (VP9_GOLD_FLAG | VP9_ALT_FLAG) &&
-        vp9_check_segref(xd, 1, GOLDEN_FRAME))
-      return 1;
-    if ((ref_flags & (VP9_ALT_FLAG  | VP9_LAST_FLAG)) == (VP9_ALT_FLAG  | VP9_LAST_FLAG) &&
-        vp9_check_segref(xd, 1, ALTREF_FRAME))
-      return 1;
     return 0;
   } else {
     return (!!(ref_flags & VP9_GOLD_FLAG) +
@@ -1382,23 +1607,6 @@
   }
 }
 
-static void reset_skip_txfm_size_mb(VP9_COMP *cpi,
-                                    MODE_INFO *mi, TX_SIZE txfm_max) {
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-
-  if (mbmi->txfm_size > txfm_max) {
-    VP9_COMMON *const cm = &cpi->common;
-    MACROBLOCK *const x = &cpi->mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
-    const int segment_id = mbmi->segment_id;
-
-    xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
-           (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-    mbmi->txfm_size = txfm_max;
-  }
-}
-
 static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
   int x, y;
 
@@ -1422,96 +1630,120 @@
   }
 }
 
-static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi,
-                                      int mis, TX_SIZE txfm_max,
-                                      int mb_rows_left, int mb_cols_left) {
+static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
+                                   int mis, TX_SIZE txfm_max,
+                                   int bw, int bh, int mi_row, int mi_col,
+                                   BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
   if (mbmi->txfm_size > txfm_max) {
-    VP9_COMMON *const cm = &cpi->common;
     MACROBLOCK *const x = &cpi->mb;
     MACROBLOCKD *const xd = &x->e_mbd;
     const int segment_id = mbmi->segment_id;
-    const int ymbs = MIN(2, mb_rows_left);
-    const int xmbs = MIN(2, mb_cols_left);
+    const int ymbs = MIN(bh, cm->mi_rows - mi_row);
+    const int xmbs = MIN(bw, cm->mi_cols - mi_col);
 
     xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
-           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
+    assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
+           get_skip_flag(mi, mis, ymbs, xmbs));
     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
   }
 }
 
-static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi,
-                                      int mis, TX_SIZE txfm_max,
-                                      int mb_rows_left, int mb_cols_left) {
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
+                                    TX_SIZE txfm_max,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
 
-  if (mbmi->txfm_size > txfm_max) {
-    VP9_COMMON *const cm = &cpi->common;
-    MACROBLOCK *const x = &cpi->mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
-    const int segment_id = mbmi->segment_id;
-    const int ymbs = MIN(4, mb_rows_left);
-    const int xmbs = MIN(4, mb_cols_left);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-    xd->mode_info_context = mi;
-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||
-           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
-    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+  bwl = mi_width_log2(mi->mbmi.sb_type);
+  bhl = mi_height_log2(mi->mbmi.sb_type);
+
+  if (bwl == bsl && bhl == bsl) {
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
+                           mi_row, mi_col, bsize);
+  } else if (bwl == bsl && bhl < bsl) {
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
+                           mi_row, mi_col, bsize);
+    reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
+                           mi_row + bs, mi_col, bsize);
+  } else if (bwl < bsl && bhl == bsl) {
+    reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
+                           mi_row, mi_col, bsize);
+    reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
+                           mi_row, mi_col + bs, bsize);
+  } else {
+    BLOCK_SIZE_TYPE subsize;
+    int n;
+
+    assert(bwl < bsl && bhl < bsl);
+    if (bsize == BLOCK_SIZE_SB64X64) {
+      subsize = BLOCK_SIZE_SB32X32;
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+    }
+
+    for (n = 0; n < 4; n++) {
+      const int y_idx = n >> 1, x_idx = n & 0x01;
+
+      reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+                              txfm_max, mi_row + y_idx * bs,
+                              mi_col + x_idx * bs, subsize);
+    }
   }
 }
 
 static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
   VP9_COMMON *const cm = &cpi->common;
-  int mb_row, mb_col;
+  int mi_row, mi_col;
   const int mis = cm->mode_info_stride;
   MODE_INFO *mi, *mi_ptr = cm->mi;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+  for (mi_row = 0; mi_row < cm->mi_rows;
+       mi_row += 8, mi_ptr += 8 * mis) {
     mi = mi_ptr;
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
-      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-        reset_skip_txfm_size_sb64(cpi, mi, mis, txfm_max,
-                                  cm->mb_rows - mb_row, cm->mb_cols - mb_col);
-      } else {
-        int i;
-
-        for (i = 0; i < 4; i++) {
-          const int x_idx_sb = (i & 1) << 1, y_idx_sb = i & 2;
-          MODE_INFO *sb_mi = mi + y_idx_sb * mis + x_idx_sb;
-
-          if (mb_row + y_idx_sb >= cm->mb_rows ||
-              mb_col + x_idx_sb >= cm->mb_cols)
-            continue;
-
-          if (sb_mi->mbmi.sb_type) {
-            reset_skip_txfm_size_sb32(cpi, sb_mi, mis, txfm_max,
-                                      cm->mb_rows - mb_row - y_idx_sb,
-                                      cm->mb_cols - mb_col - x_idx_sb);
-          } else {
-            int m;
-
-            for (m = 0; m < 4; m++) {
-              const int x_idx = x_idx_sb + (m & 1), y_idx = y_idx_sb + (m >> 1);
-              MODE_INFO *mb_mi;
-
-              if (mb_col + x_idx >= cm->mb_cols ||
-                  mb_row + y_idx >= cm->mb_rows)
-                continue;
-
-              mb_mi = mi + y_idx * mis + x_idx;
-              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-              reset_skip_txfm_size_mb(cpi, mb_mi, txfm_max);
-            }
-          }
-        }
-      }
+    for (mi_col = 0; mi_col < cm->mi_cols;
+         mi_col += 8, mi += 8) {
+      reset_skip_txfm_size_sb(cpi, mi, txfm_max,
+                              mi_row, mi_col, BLOCK_SIZE_SB64X64);
     }
   }
 }
 
 void vp9_encode_frame(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  // In the longer term the encoder should be generalized to match the
+  // decoder such that we allow compound where one of the 3 buffers has a
+  // different sign bias and that buffer is then the fixed ref. However, this
+  // requires further work in the rd loop. For now the only supported encoder
+  // side behaviour is where the ALT ref buffer has opposite sign bias to
+  // the other two.
+  if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+       cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
+      (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
+       cm->ref_frame_sign_bias[LAST_FRAME])) {
+    cm->allow_comp_inter_inter = 0;
+  } else {
+    cm->allow_comp_inter_inter = 1;
+    cm->comp_fixed_ref = ALTREF_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = GOLDEN_FRAME;
+  }
+
   if (cpi->sf.RD) {
     int i, frame_type, pred_type;
     TXFM_MODE txfm_type;
@@ -1535,7 +1767,7 @@
       frame_type = 2;
 
     /* prediction (compound, single or hybrid) mode selection */
-    if (frame_type == 3)
+    if (frame_type == 3 || !cm->allow_comp_inter_inter)
       pred_type = SINGLE_PREDICTION_ONLY;
     else if (cpi->rd_prediction_type_threshes[frame_type][1] >
                  cpi->rd_prediction_type_threshes[frame_type][0] &&
@@ -1584,15 +1816,11 @@
     } else
       txfm_type = ALLOW_8X8;
 #else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >
                   cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
                     ALLOW_32X32 : TX_MODE_SELECT;
 #endif
     cpi->common.txfm_mode = txfm_type;
-    if (txfm_type != TX_MODE_SELECT) {
-      cpi->common.prob_tx[0] = 128;
-      cpi->common.prob_tx[1] = 128;
-    }
     cpi->common.comp_pred_mode = pred_type;
     encode_frame_internal(cpi);
 
@@ -1617,29 +1845,50 @@
       int single_count_zero = 0;
       int comp_count_zero = 0;
 
-      for (i = 0; i < COMP_PRED_CONTEXTS; i++) {
-        single_count_zero += cpi->single_pred_count[i];
-        comp_count_zero += cpi->comp_pred_count[i];
+      for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+        single_count_zero += cpi->comp_inter_count[i][0];
+        comp_count_zero += cpi->comp_inter_count[i][1];
       }
 
       if (comp_count_zero == 0) {
         cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+        vp9_zero(cpi->comp_inter_count);
       } else if (single_count_zero == 0) {
         cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+        vp9_zero(cpi->comp_inter_count);
       }
     }
 
     if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
-                           cpi->txfm_count_32x32p[TX_4X4] +
-                           cpi->txfm_count_8x8p[TX_4X4];
-      const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
-                              cpi->txfm_count_16x16p[TX_8X8];
-      const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
-      const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
-      const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
-      const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];
+      int count4x4 = 0;
+      int count8x8_lp = 0, count8x8_8x8p = 0;
+      int count16x16_16x16p = 0, count16x16_lp = 0;
+      int count32x32 = 0;
 
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count4x4 += cm->fc.tx_count_32x32p[i][TX_4X4];
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count4x4 += cm->fc.tx_count_16x16p[i][TX_4X4];
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count4x4 += cm->fc.tx_count_8x8p[i][TX_4X4];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count8x8_lp += cm->fc.tx_count_32x32p[i][TX_8X8];
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count8x8_lp += cm->fc.tx_count_16x16p[i][TX_8X8];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count8x8_8x8p += cm->fc.tx_count_8x8p[i][TX_8X8];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count16x16_16x16p += cm->fc.tx_count_16x16p[i][TX_16X16];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count16x16_lp += cm->fc.tx_count_32x32p[i][TX_16X16];
+
+      for (i = 0; i < TX_SIZE_CONTEXTS; i++)
+        count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32];
+
       if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
           count32x32 == 0) {
         cpi->common.txfm_mode = ALLOW_8X8;
@@ -1665,70 +1914,7 @@
 
 }
 
-void vp9_setup_block_ptrs(MACROBLOCK *x) {
-  int r, c;
-  int i;
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
-  }
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++)
-      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
-  }
-
-
-  for (r = 0; r < 2; r++) {
-    for (c = 0; c < 2; c++)
-      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
-  }
-
-  for (i = 0; i < 24; i++)
-    x->block[i].coeff = x->coeff + i * 16;
-}
-
 void vp9_build_block_offsets(MACROBLOCK *x) {
-  int block = 0;
-  int br, bc;
-
-  vp9_build_block_doffsets(&x->e_mbd);
-
-  for (br = 0; br < 4; br++) {
-    for (bc = 0; bc < 4; bc++) {
-      BLOCK *this_block = &x->block[block];
-      // this_block->base_src = &x->src.y_buffer;
-      // this_block->src_stride = x->src.y_stride;
-      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      this_block->base_src = &x->src.y_buffer;
-      this_block->src_stride = x->src.y_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-
-  // u blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.u_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
-
-  // v blocks
-  for (br = 0; br < 2; br++) {
-    for (bc = 0; bc < 2; bc++) {
-      BLOCK *this_block = &x->block[block];
-      this_block->base_src = &x->src.v_buffer;
-      this_block->src_stride = x->src.uv_stride;
-      this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      ++block;
-    }
-  }
 }
 
 static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -1736,53 +1922,23 @@
   const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
   const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
 
-#ifdef MODE_STATS
-  const int is_key = cpi->common.frame_type == KEY_FRAME;
-
-  ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
-  ++ uv_modes_y[m][uvm];
-
-  if (m == B_PRED) {
-    unsigned int *const bct = is_key ? b_modes : inter_b_modes;
-
-    int b = 0;
-
-    do {
-      ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < 16);
-  }
-
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
-  }
-#endif
-
-  if (xd->mode_info_context->mbmi.sb_type) {
-    ++cpi->sb_ymode_count[m];
+  ++cpi->y_uv_mode_count[m][uvm];
+  if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+    const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+    const int bsl = MIN(bwl, bhl);
+    ++cpi->y_mode_count[MIN(bsl, 3)][m];
   } else {
-    ++cpi->ymode_count[m];
+    int idx, idy;
+    int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type);
+    int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first;
+        ++cpi->y_mode_count[0][m];
+      }
+    }
   }
-  if (m != I8X8_PRED)
-    ++cpi->y_uv_mode_count[m][uvm];
-  else {
-    cpi->i8x8_mode_count[xd->block[0].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[2].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[8].bmi.as_mode.first]++;
-    cpi->i8x8_mode_count[xd->block[10].bmi.as_mode.first]++;
-  }
-  if (m == B_PRED) {
-    int b = 0;
-    do {
-      int m = xd->block[b].bmi.as_mode.first;
-#if CONFIG_NEWBINTRAMODES
-      if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
-#endif
-      ++cpi->bmode_count[m];
-    } while (++b < 16);
-  }
 }
 
 // Experimental stub function to create a per MB zbin adjustment based on
@@ -1806,268 +1962,22 @@
 #endif
 }
 
-static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
-                                         ENTROPY_CONTEXT_PLANES ta[16],
-                                         ENTROPY_CONTEXT_PLANES tl[16],
-                                         TOKENEXTRA *t[16],
-                                         TOKENEXTRA **tp,
-                                         int skip[16], int output_enabled) {
-  MACROBLOCK *const x = &cpi->mb;
-
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_32X32) {
-    TOKENEXTRA tokens[4][1024+512];
-    int n_tokens[4], n;
-
-    // if there were no skips, we don't need to do anything
-    if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
-      return;
-
-    // if we don't do coeff skipping for this frame, we don't
-    // need to do anything here
-    if (!cpi->common.mb_no_coeff_skip)
-      return;
-
-    // if all 4 MBs skipped coeff coding, nothing to be done
-    if (skip[0] && skip[1] && skip[2] && skip[3])
-      return;
-
-    // so the situation now is that we want to skip coeffs
-    // for some MBs, but not all, and we didn't code EOB
-    // coefficients for them. However, the skip flag for this
-    // SB will be 0 overall, so we need to insert EOBs in the
-    // middle of the token tree. Do so here.
-    for (n = 0; n < 4; n++) {
-      if (n < 3) {
-        n_tokens[n] = t[n + 1] - t[n];
-      } else {
-        n_tokens[n] = *tp - t[3];
-      }
-      if (n_tokens[n]) {
-        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
-      }
-    }
-
-    // reset pointer, stuff EOBs where necessary
-    *tp = t[0];
-    for (n = 0; n < 4; n++) {
-      if (skip[n]) {
-        x->e_mbd.above_context = &ta[n * 2];
-        x->e_mbd.left_context  = &tl[n * 2];
-        vp9_stuff_sb(cpi, &x->e_mbd, tp, !output_enabled);
-      } else {
-        if (n_tokens[n]) {
-          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-        }
-        (*tp) += n_tokens[n];
-      }
-    }
-  } else {
-    TOKENEXTRA tokens[16][16 * 25];
-    int n_tokens[16], n;
-
-    // if there were no skips, we don't need to do anything
-    if (!skip[ 0] && !skip[ 1] && !skip[ 2] && !skip[ 3] &&
-        !skip[ 4] && !skip[ 5] && !skip[ 6] && !skip[ 7] &&
-        !skip[ 8] && !skip[ 9] && !skip[10] && !skip[11] &&
-        !skip[12] && !skip[13] && !skip[14] && !skip[15])
-      return;
-
-    // if we don't do coeff skipping for this frame, we don't
-    // need to do anything here
-    if (!cpi->common.mb_no_coeff_skip)
-      return;
-
-    // if all 4 MBs skipped coeff coding, nothing to be done
-    if (skip[ 0] && skip[ 1] && skip[ 2] && skip[ 3] &&
-        skip[ 4] && skip[ 5] && skip[ 6] && skip[ 7] &&
-        skip[ 8] && skip[ 9] && skip[10] && skip[11] &&
-        skip[12] && skip[13] && skip[14] && skip[15])
-      return;
-
-    // so the situation now is that we want to skip coeffs
-    // for some MBs, but not all, and we didn't code EOB
-    // coefficients for them. However, the skip flag for this
-    // SB will be 0 overall, so we need to insert EOBs in the
-    // middle of the token tree. Do so here.
-    for (n = 0; n < 16; n++) {
-      if (n < 15) {
-        n_tokens[n] = t[n + 1] - t[n];
-      } else {
-        n_tokens[n] = *tp - t[15];
-      }
-      if (n_tokens[n]) {
-        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
-      }
-    }
-
-    // reset pointer, stuff EOBs where necessary
-    *tp = t[0];
-    for (n = 0; n < 16; n++) {
-      if (skip[n]) {
-        x->e_mbd.above_context = &ta[n];
-        x->e_mbd.left_context  = &tl[n];
-        vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
-      } else {
-        if (n_tokens[n]) {
-          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
-        }
-        (*tp) += n_tokens[n];
-      }
-    }
-  }
-}
-
-#if CONFIG_CODE_NONZEROCOUNT
-static void gather_nzcs_mb16(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i;
-  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 24; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 16; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV) {
-        for (i = 16; i < 24; ++i) {
-          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-        }
-      } else {
-        for (i = 16; i < 24; i += 4) {
-          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-        }
-      }
-      break;
-
-    case TX_16X16:
-      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
-      for (i = 16; i < 24; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-}
-
-static void gather_nzcs_sb32(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i, j;
-  MODE_INFO *m = xd->mode_info_context;
-  int mis = cm->mode_info_stride;
-  vpx_memset(m->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 96; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 96; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_16X16:
-      for (i = 0; i < 96; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_32X32:
-      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];
-      for (i = 64; i < 96; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-  for (i = 0; i < 2; ++i)
-    for (j = 0; j < 2; ++j) {
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-}
-
-static void gather_nzcs_sb64(VP9_COMMON *const cm,
-                             MACROBLOCKD *xd) {
-  int i, j;
-  MODE_INFO *m = xd->mode_info_context;
-  int mis = cm->mode_info_stride;
-  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,
-             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_4X4:
-      for (i = 0; i < 384; ++i) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_8X8:
-      for (i = 0; i < 384; i += 4) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_16X16:
-      for (i = 0; i < 384; i += 16) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    case TX_32X32:
-      for (i = 0; i < 384; i += 64) {
-        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];
-      }
-      break;
-
-    default:
-      break;
-  }
-  for (i = 0; i < 4; ++i)
-    for (j = 0; j < 4; ++j) {
-      if (i == 0 && j == 0) continue;
-      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,
-                 384 * sizeof(m->mbmi.nzcs[0]));
-    }
-}
-#endif
-
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled,
-                              int mb_row, int mb_col) {
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int output_enabled, int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  int n;
   MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  unsigned int segment_id = mbmi->segment_id;
   const int mis = cm->mode_info_stride;
-  unsigned char ref_pred_flag;
+  const int bwl = mi_width_log2(bsize);
+  const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
 
-  assert(!xd->mode_info_context->mbmi.sb_type);
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
   if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM && output_enabled) {
-      // Adjust the zbin based on this MB rate.
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
       vp9_update_zbin_extra(cpi, x);
     }
@@ -2083,16 +1993,17 @@
     // Increase zbin size to suppress noise
     cpi->zbin_mode_boost = 0;
     if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
+      if (mbmi->ref_frame[0] != INTRA_FRAME) {
         if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
+          if (mbmi->ref_frame[0] != LAST_FRAME)
             cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
           else
             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
+        } else if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
           cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
+        } else {
           cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+        }
       } else {
         cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
       }
@@ -2099,641 +2010,94 @@
     }
 
     vp9_update_zbin_extra(cpi, x);
-
-    // SET VARIOUS PREDICTION FLAGS
-
-    // Did the chosen reference frame match its predicted value.
-    ref_pred_flag = ((mbmi->ref_frame == vp9_get_pred_ref(cm, xd)));
-    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
   }
 
-  if (mbmi->ref_frame == INTRA_FRAME) {
-#if 0  // def ENC_DEBUG
-    if (enc_debug) {
-      printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip,
-             mbmi->txfm_size);
-    }
-#endif
-    if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra4x4mby(x);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(x);
-      vp9_encode_intra8x8mbuv(x);
-    } else {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra16x16mby(cm, x);
-    }
-
+  if (mbmi->ref_frame[0] == INTRA_FRAME) {
+    vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
+                                    BLOCK_SIZE_SB8X8 : bsize);
+    vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
+                                     BLOCK_SIZE_SB8X8 : bsize);
     if (output_enabled)
       sum_intra_stats(cpi, x);
   } else {
-    int ref_fb_idx;
-#ifdef ENC_DEBUG
-    if (enc_debug)
-      printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-             mbmi->mode, x->skip, mbmi->txfm_size,
-             mbmi->ref_frame, mbmi->second_ref_frame,
-             mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-             mbmi->interp_filter);
-#endif
+    int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])];
+    YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
+    YV12_BUFFER_CONFIG *second_ref_fb = NULL;
+    if (mbmi->ref_frame[1] > 0) {
+      idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])];
+      second_ref_fb = &cm->yv12_fb[idx];
+    }
 
     assert(cm->frame_type != KEY_FRAME);
 
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
+    setup_pre_planes(xd, ref_fb, second_ref_fb,
+                     mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
 
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
-    if (mbmi->second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-    }
-
-    if (!x->skip) {
-      vp9_encode_inter16x16(cm, x, mb_row, mb_col);
-
-      // Clear mb_skip_coeff if mb_no_coeff_skip is not set
-      if (!cpi->common.mb_no_coeff_skip)
-        mbmi->mb_skip_coeff = 0;
-
-    } else {
-      vp9_build_inter16x16_predictors_mb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-        vp9_build_interintra_16x16_predictors_mb(xd,
-                                                 xd->dst.y_buffer,
-                                                 xd->dst.u_buffer,
-                                                 xd->dst.v_buffer,
-                                                 xd->dst.y_stride,
-                                                 xd->dst.uv_stride);
-      }
-#endif
-    }
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
+                                  bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8
+                                                           : bsize);
   }
 
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i, j;
-      printf("\n");
-      printf("qcoeff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("src_diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", x->src_diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->block[0].diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("final y\n");
-      for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++)
-          printf("%3d ", xd->dst.y_buffer[i * xd->dst.y_stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final u\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->dst.u_buffer[i * xd->dst.uv_stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final v\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->dst.v_buffer[i * xd->dst.uv_stride + j]);
-        printf("\n");
-      }
-      fflush(stdout);
-    }
-#endif
-
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_mb16(cm, xd);
-#endif
-    vp9_tokenize_mb(cpi, xd, t, !output_enabled);
-
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled,
+                    (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+  } else if (!x->skip) {
+    vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled,
+                    (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
+    int mb_skip_context =
+        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
 
-    if (cm->mb_no_coeff_skip) {
-      mbmi->mb_skip_coeff = 1;
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_reset_mb_tokens_context(xd);
-    } else {
-      vp9_stuff_mb(cpi, xd, t, !output_enabled);
-      mbmi->mb_skip_coeff = 0;
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP)))) {
-      assert(mbmi->txfm_size <= TX_16X16);
-      if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-      mbmi->txfm_size = TX_16X16;
-    } else if (mbmi->mode != B_PRED &&
-               !(mbmi->mode == SPLITMV &&
-                 mbmi->partitioning == PARTITIONING_4X4) &&
-               cpi->common.txfm_mode >= ALLOW_8X8) {
-      mbmi->txfm_size = TX_8X8;
-    } else {
-      mbmi->txfm_size = TX_4X4;
-    }
-  }
-}
-
-static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  unsigned char ref_pred_flag;
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  unsigned int segment_id = mi->mbmi.segment_id;
-  const int mis = cm->mode_info_stride;
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug) {
-    printf("Encode SB32 %d %d output %d\n", mb_row, mb_col, output_enabled);
-    printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-           mi->mbmi.mode, x->skip, mi->mbmi.txfm_size,
-           mi->mbmi.ref_frame, mi->mbmi.second_ref_frame,
-           mi->mbmi.mv[0].as_mv.row, mi->mbmi.mv[0].as_mv.col,
-           mi->mbmi.interp_filter);
-  }
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-
-    // SET VARIOUS PREDICTION FLAGS
-    // Did the chosen reference frame match its predicted value.
-    ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
-                      vp9_get_pred_ref(cm, xd)));
-    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-  }
-
-
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
+    mbmi->mb_skip_coeff = 1;
     if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx;
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-    }
-
-    vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
+      cm->fc.mbskip_count[mb_skip_context][1]++;
+    vp9_reset_sb_tokens_context(xd,
+                 (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   }
 
-  if (!x->skip) {
-    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,
-                         dst, dst_y_stride);
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride);
-    switch (mi->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sby_32x32(x);
-        vp9_transform_sbuv_16x16(x);
-        vp9_quantize_sby_32x32(x);
-        vp9_quantize_sbuv_16x16(x);
-        if (x->optimize) {
-          vp9_optimize_sby_32x32(cm, x);
-          vp9_optimize_sbuv_16x16(cm, x);
-        }
-        vp9_inverse_transform_sby_32x32(xd);
-        vp9_inverse_transform_sbuv_16x16(xd);
-        break;
-      case TX_16X16:
-        vp9_transform_sby_16x16(x);
-        vp9_transform_sbuv_16x16(x);
-        vp9_quantize_sby_16x16(x);
-        vp9_quantize_sbuv_16x16(x);
-        if (x->optimize) {
-          vp9_optimize_sby_16x16(cm, x);
-          vp9_optimize_sbuv_16x16(cm, x);
-        }
-        vp9_inverse_transform_sby_16x16(xd);
-        vp9_inverse_transform_sbuv_16x16(xd);
-        break;
-      case TX_8X8:
-        vp9_transform_sby_8x8(x);
-        vp9_transform_sbuv_8x8(x);
-        vp9_quantize_sby_8x8(x);
-        vp9_quantize_sbuv_8x8(x);
-        if (x->optimize) {
-          vp9_optimize_sby_8x8(cm, x);
-          vp9_optimize_sbuv_8x8(cm, x);
-        }
-        vp9_inverse_transform_sby_8x8(xd);
-        vp9_inverse_transform_sbuv_8x8(xd);
-        break;
-      case TX_4X4:
-        vp9_transform_sby_4x4(x);
-        vp9_transform_sbuv_4x4(x);
-        vp9_quantize_sby_4x4(x);
-        vp9_quantize_sbuv_4x4(x);
-        if (x->optimize) {
-          vp9_optimize_sby_4x4(cm, x);
-          vp9_optimize_sbuv_4x4(cm, x);
-        }
-        vp9_inverse_transform_sby_4x4(xd);
-        vp9_inverse_transform_sbuv_4x4(xd);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sby_s_c(xd, dst);
-    vp9_recon_sbuv_s_c(xd, udst, vdst);
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_sb32(cm, xd);
-#endif
-
-    vp9_tokenize_sb(cpi, xd, t, !output_enabled);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context = cm->mb_no_coeff_skip ?
-          (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
-
-    mi->mbmi.mb_skip_coeff = 1;
-    if (cm->mb_no_coeff_skip) {
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_reset_sb_tokens_context(xd);
-    } else {
-      vp9_stuff_sb(cpi, xd, t, !output_enabled);
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
   // copy skip flag on all mb_mode_info contexts in this SB
   // if this was a skip at this txfm size
-  if (mb_col < cm->mb_cols - 1)
-    mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  if (mb_row < cm->mb_rows - 1) {
-    mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-    if (mb_col < cm->mb_cols - 1)
-      mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
+  for (n = 1; n < bw * bh; n++) {
+    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
+    if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
+      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
   }
 
   if (output_enabled) {
     if (cm->txfm_mode == TX_MODE_SELECT &&
-        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||
-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
-    } else {
-      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
-      mi->mbmi.txfm_size = sz;
-      if (mb_col < cm->mb_cols - 1)
-        mi[1].mbmi.txfm_size = sz;
-      if (mb_row < cm->mb_rows - 1) {
-        mi[mis].mbmi.txfm_size = sz;
-        if (mb_col < cm->mb_cols - 1)
-          mi[mis + 1].mbmi.txfm_size = sz;
-      }
-    }
-  }
-}
-
-static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
-                                int output_enabled, int mb_row, int mb_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *src = x->src.y_buffer;
-  uint8_t *dst = xd->dst.y_buffer;
-  const uint8_t *usrc = x->src.u_buffer;
-  uint8_t *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer;
-  uint8_t *vdst = xd->dst.v_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  unsigned char ref_pred_flag;
-  int n;
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  unsigned int segment_id = mi->mbmi.segment_id;
-  const int mis = cm->mode_info_stride;
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode SB64 %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        } else {
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-        }
+        mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
+        !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff ||
+          vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+      const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
+      if (bsize >= BLOCK_SIZE_SB32X32) {
+        cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++;
+      } else if (bsize >= BLOCK_SIZE_MB16X16) {
+        cm->fc.tx_count_16x16p[context][mbmi->txfm_size]++;
       } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
+        cm->fc.tx_count_8x8p[context][mbmi->txfm_size]++;
       }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-
-    // Did the chosen reference frame match its predicted value.
-    ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
-                      vp9_get_pred_ref(cm, xd)));
-    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
-  }
-
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
-    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx;
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    setup_pred_block(&xd->pre,
-                     &cpi->common.yv12_fb[ref_fb_idx],
-                     mb_row, mb_col,
-                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      int second_ref_fb_idx;
-
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-      setup_pred_block(&xd->second_pre,
-                       &cpi->common.yv12_fb[second_ref_fb_idx],
-                       mb_row, mb_col,
-                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);
-    }
-
-    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
-                                       xd->dst.u_buffer, xd->dst.v_buffer,
-                                       xd->dst.y_stride, xd->dst.uv_stride,
-                                       mb_row, mb_col);
-  }
-
-  if (!x->skip) {
-    vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
-    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                            udst, vdst, dst_uv_stride);
-
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sb64y_32x32(x);
-        vp9_transform_sb64uv_32x32(x);
-        vp9_quantize_sb64y_32x32(x);
-        vp9_quantize_sb64uv_32x32(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_32x32(cm, x);
-          vp9_optimize_sb64uv_32x32(cm, x);
-        }
-        vp9_inverse_transform_sb64y_32x32(xd);
-        vp9_inverse_transform_sb64uv_32x32(xd);
-        break;
-      case TX_16X16:
-        vp9_transform_sb64y_16x16(x);
-        vp9_transform_sb64uv_16x16(x);
-        vp9_quantize_sb64y_16x16(x);
-        vp9_quantize_sb64uv_16x16(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_16x16(cm, x);
-          vp9_optimize_sb64uv_16x16(cm, x);
-        }
-        vp9_inverse_transform_sb64y_16x16(xd);
-        vp9_inverse_transform_sb64uv_16x16(xd);
-        break;
-      case TX_8X8:
-        vp9_transform_sb64y_8x8(x);
-        vp9_transform_sb64uv_8x8(x);
-        vp9_quantize_sb64y_8x8(x);
-        vp9_quantize_sb64uv_8x8(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_8x8(cm, x);
-          vp9_optimize_sb64uv_8x8(cm, x);
-        }
-        vp9_inverse_transform_sb64y_8x8(xd);
-        vp9_inverse_transform_sb64uv_8x8(xd);
-        break;
-      case TX_4X4:
-        vp9_transform_sb64y_4x4(x);
-        vp9_transform_sb64uv_4x4(x);
-        vp9_quantize_sb64y_4x4(x);
-        vp9_quantize_sb64uv_4x4(x);
-        if (x->optimize) {
-          vp9_optimize_sb64y_4x4(cm, x);
-          vp9_optimize_sb64uv_4x4(cm, x);
-        }
-        vp9_inverse_transform_sb64y_4x4(xd);
-        vp9_inverse_transform_sb64uv_4x4(xd);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sb64y_s_c(xd, dst);
-    vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst);
-#if CONFIG_CODE_NONZEROCOUNT
-    gather_nzcs_sb64(cm, &x->e_mbd);
-#endif
-    vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context = cpi->common.mb_no_coeff_skip ?
-        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;
-
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-    if (cm->mb_no_coeff_skip) {
-      if (output_enabled)
-        cpi->skip_true_count[mb_skip_context]++;
-      vp9_reset_sb64_tokens_context(xd);
     } else {
-      vp9_stuff_sb64(cpi, xd, t, !output_enabled);
-      if (output_enabled)
-        cpi->skip_false_count[mb_skip_context]++;
-    }
-  }
-
-  // copy skip flag on all mb_mode_info contexts in this SB
-  // if this was a skip at this txfm size
-  for (n = 1; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    if (mb_col + x_idx < cm->mb_cols && mb_row + y_idx < cm->mb_rows)
-      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  }
-
-  if (output_enabled) {
-    if (cm->txfm_mode == TX_MODE_SELECT &&
-        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||
-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
-      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
-    } else {
       int x, y;
       TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
-      for (y = 0; y < 4; y++) {
-        for (x = 0; x < 4; x++) {
-          if (mb_col + x < cm->mb_cols && mb_row + y < cm->mb_rows) {
+       // The new intra coding scheme requires no change of transform size
+      if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
+        if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
+          sz = TX_16X16;
+        if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
+          sz = TX_8X8;
+        if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8)
+          sz = TX_4X4;
+      } else if (bsize >= BLOCK_SIZE_SB8X8) {
+        sz = mbmi->txfm_size;
+      } else {
+        sz = TX_4X4;
+      }
+
+      for (y = 0; y < bh; y++) {
+        for (x = 0; x < bw; x++) {
+          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
             mi[mis * y + x].mbmi.txfm_size = sz;
           }
         }
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -13,9 +13,12 @@
 #define VP9_ENCODER_VP9_ENCODEFRAME_H_
 
 struct macroblock;
+struct yv12_buffer_config;
 
 void vp9_build_block_offsets(struct macroblock *x);
 
-void vp9_setup_block_ptrs(struct macroblock *x);
+void vp9_setup_src_planes(struct macroblock *x,
+                          const struct yv12_buffer_config *src,
+                          int mb_row, int mb_col);
 
 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -19,217 +19,15 @@
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
-
+  mbmi->mode = DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
   if (use_16x16_pred) {
-    mbmi->mode = DC_PRED;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = INTRA_FRAME;
-
-    vp9_encode_intra16x16mby(&cpi->common, x);
+    mbmi->txfm_size = mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? TX_16X16 : TX_8X8;
+    vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
   } else {
-    int i;
-
-    for (i = 0; i < 16; i++) {
-      x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
-      vp9_encode_intra4x4block(x, i);
-    }
+    mbmi->txfm_size = TX_4X4;
+    vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type);
   }
 
-  return vp9_get_mb_ss(x->src_diff);
-}
-
-void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-  TX_TYPE tx_type;
-
-#if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);
-#endif
-
-  vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);
-  vp9_subtract_b(be, b, 16);
-
-  tx_type = get_tx_type_4x4(&x->e_mbd, ib);
-  if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-    vp9_ht_quantize_b_4x4(x, ib, tx_type);
-    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
-  } else {
-    x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-    x->quantize_b_4x4(x, ib);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
-                                b->dqcoeff, b->diff, 32);
-  }
-
-  vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
-  int i;
-
-  for (i = 0; i < 16; i++)
-    vp9_encode_intra4x4block(mb, i);
-}
-
-void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  vp9_build_intra_predictors_mby(xd);
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  switch (tx_size) {
-    case TX_16X16:
-      vp9_transform_mby_16x16(x);
-      vp9_quantize_mby_16x16(x);
-      if (x->optimize)
-        vp9_optimize_mby_16x16(cm, x);
-      vp9_inverse_transform_mby_16x16(xd);
-      break;
-    case TX_8X8:
-      vp9_transform_mby_8x8(x);
-      vp9_quantize_mby_8x8(x);
-      if (x->optimize)
-        vp9_optimize_mby_8x8(cm, x);
-      vp9_inverse_transform_mby_8x8(xd);
-      break;
-    default:
-      vp9_transform_mby_4x4(x);
-      vp9_quantize_mby_4x4(x);
-      if (x->optimize)
-        vp9_optimize_mby_4x4(cm, x);
-      vp9_inverse_transform_mby_4x4(xd);
-      break;
-  }
-
-  vp9_recon_mby(xd);
-}
-
-void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  vp9_build_intra_predictors_mbuv(xd);
-
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    xd->predictor, x->src.uv_stride);
-
-  switch (tx_size) {
-    case TX_4X4:
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mbuv_4x4(x);
-      if (x->optimize)
-        vp9_optimize_mbuv_4x4(cm, x);
-      vp9_inverse_transform_mbuv_4x4(xd);
-      break;
-    default:  // 16x16 or 8x8
-      vp9_transform_mbuv_8x8(x);
-      vp9_quantize_mbuv_8x8(x);
-      if (x->optimize)
-        vp9_optimize_mbuv_8x8(cm, x);
-      vp9_inverse_transform_mbuv_8x8(xd);
-      break;
-    }
-
-  vp9_recon_intra_mbuv(xd);
-}
-
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCKD *b = &xd->block[ib];
-  BLOCK *be = &x->block[ib];
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-  vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, b->predictor);
-  // generate residual blocks
-  vp9_subtract_4b_c(be, b, 16);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-    tx_type = get_tx_type_8x8(xd, ib);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
-      x->quantize_b_8x8(x, idx, tx_type);
-      vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,
-                            16, tx_type);
-    } else {
-      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x, idx, DCT_DCT);
-      vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      b = &xd->block[ib + iblock[i]];
-      be = &x->block[ib + iblock[i]];
-      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-      if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-        vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
-        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);
-      } else if (!(i & 1) &&
-                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-        x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
-        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
-                                    b->dqcoeff, b->diff, 32);
-        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i] + 1],
-                                    (b + 1)->dqcoeff, (b + 1)->diff, 32);
-        i++;
-      } else {
-        x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(x, ib + iblock[i]);
-        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],
-                                    b->dqcoeff, b->diff, 32);
-      }
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    b = &xd->block[ib + iblock[i]];
-    vp9_recon_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                  b->dst_stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++)
-    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);
-}
-
-static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
-  BLOCKD *b = &x->e_mbd.block[ib];
-  BLOCK *be = &x->block[ib];
-
-  vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor);
-
-  vp9_subtract_b(be, b, 8);
-
-  x->fwd_txm4x4(be->src_diff, be->coeff, 16);
-  x->quantize_b_4x4(x, ib);
-  vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],
-                              b->dqcoeff, b->diff, 16);
-
-  vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
-                   b->dst_stride);
-}
-
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    BLOCKD *b = &x->e_mbd.block[vp9_i8x8_block[i]];
-    int mode = b->bmi.as_mode.first;
-
-    encode_intra_uv4x4(x, i + 16, mode);  // u
-    encode_intra_uv4x4(x, i + 20, mode);  // v
-  }
+  return vp9_get_mb_ss(x->plane[0].src_diff);
 }
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,12 +14,9 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_intra4x4mby(MACROBLOCK *mb);
-void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
-void vp9_encode_intra8x8mby(MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
+void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb,
+                              BLOCK_SIZE_TYPE bs);
+void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb,
+                               BLOCK_SIZE_TYPE bs);
 
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -20,481 +20,55 @@
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9_rtcd.h"
 
-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  uint8_t *src_ptr = (*(be->base_src) + be->src);
-  int16_t *diff_ptr = be->src_diff;
-  uint8_t *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
+void vp9_subtract_block(int rows, int cols,
+                        int16_t *diff_ptr, int diff_stride,
+                        const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *pred_ptr, int pred_stride) {
   int r, c;
 
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
       diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 
-    diff_ptr += pitch;
-    pred_ptr += pitch;
+    diff_ptr += diff_stride;
+    pred_ptr += pred_stride;
     src_ptr  += src_stride;
   }
 }
 
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  uint8_t *src_ptr = (*(be->base_src) + be->src);
-  int16_t *diff_ptr = be->src_diff;
-  uint8_t *pred_ptr = bd->predictor;
-  int src_stride = be->src_stride;
-  int r, c;
 
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++)
-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
+  struct macroblock_plane *const p = &x->plane[plane];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int bw = plane_block_width(bsize, pd);
+  const int bh = plane_block_height(bsize, pd);
 
-    diff_ptr += pitch;
-    pred_ptr += pitch;
-    src_ptr  += src_stride;
-  }
+  vp9_subtract_block(bh, bw, p->src_diff, bw,
+                     p->src.buf, p->src.stride,
+                     pd->dst.buf, pd->dst.stride);
 }
 
-void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride) {
-  int16_t *udiff = diff + 256;
-  int16_t *vdiff = diff + 320;
-  int r, c;
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++)
-      udiff[c] = usrc[c] - upred[c];
-
-    udiff += 8;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 8; r++) {
-    for (c = 0; c < 8; c++) {
-      vdiff[c] = vsrc[c] - vpred[c];
-    }
-
-    vdiff += 8;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  subtract_plane(x, bsize, 0);
 }
 
-void vp9_subtract_mbuv_c(int16_t *diff, uint8_t *usrc,
-                         uint8_t *vsrc, uint8_t *pred, int stride) {
-  uint8_t *upred = pred + 256;
-  uint8_t *vpred = pred + 320;
-
-  vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
-}
-
-void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += 16;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += 32;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride) {
-  int16_t *udiff = diff + 1024;
-  int16_t *vdiff = diff + 1024 + 256;
-  int r, c;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++)
-      udiff[c] = usrc[c] - upred[c];
-
-    udiff += 16;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++)
-      vdiff[c] = vsrc[c] - vpred[c];
-
-    vdiff += 16;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
-}
-
-void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                            const uint8_t *pred, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < 64; r++) {
-    for (c = 0; c < 64; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += 64;
-    pred += dst_stride;
-    src  += src_stride;
-  }
-}
-
-void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
-                             const uint8_t *vsrc, int src_stride,
-                             const uint8_t *upred,
-                             const uint8_t *vpred, int dst_stride) {
-  int16_t *udiff = diff + 4096;
-  int16_t *vdiff = diff + 4096 + 1024;
-  int r, c;
-
-  for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++) {
-      udiff[c] = usrc[c] - upred[c];
-    }
-
-    udiff += 32;
-    upred += dst_stride;
-    usrc  += src_stride;
-  }
-
-  for (r = 0; r < 32; r++) {
-    for (c = 0; c < 32; c++) {
-      vdiff[c] = vsrc[c] - vpred[c];
-    }
-
-    vdiff += 32;
-    vpred += dst_stride;
-    vsrc  += src_stride;
-  }
-}
-
-void vp9_subtract_mby_c(int16_t *diff, uint8_t *src,
-                        uint8_t *pred, int stride) {
-  vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
-}
-
-static void subtract_mb(MACROBLOCK *x) {
-  BLOCK *b = &x->block[0];
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
-                   b->src_stride);
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-}
-
-void vp9_transform_mby_4x4(MACROBLOCK *x) {
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
   int i;
-  MACROBLOCKD *xd = &x->e_mbd;
 
-  for (i = 0; i < 16; i++) {
-    BLOCK *b = &x->block[i];
-    TX_TYPE tx_type = get_tx_type_4x4(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);
-    } else if (!(i & 1) && get_tx_type_4x4(xd, i + 1) == DCT_DCT) {
-      x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32);
-      i++;
-    } else {
-      x->fwd_txm4x4(x->block[i].src_diff, x->block[i].coeff, 32);
-    }
-  }
+  for (i = 1; i < MAX_MB_PLANE; i++)
+    subtract_plane(x, bsize, i);
 }
 
-void vp9_transform_mbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 2)
-    x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 16);
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  vp9_subtract_sby(x, bsize);
+  vp9_subtract_sbuv(x, bsize);
 }
 
-static void transform_mb_4x4(MACROBLOCK *x) {
-  vp9_transform_mby_4x4(x);
-  vp9_transform_mbuv_4x4(x);
-}
 
-void vp9_transform_mby_8x8(MACROBLOCK *x) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  TX_TYPE tx_type;
-
-  for (i = 0; i < 9; i += 8) {
-    BLOCK *b = &x->block[i];
-    tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 32);
-    }
-  }
-  for (i = 2; i < 11; i += 8) {
-    BLOCK *b = &x->block[i];
-    tx_type = get_tx_type_8x8(xd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(x->block[i].src_diff, x->block[i + 2].coeff, 32);
-    }
-  }
-}
-
-void vp9_transform_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i += 4)
-    x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 16);
-}
-
-void vp9_transform_mb_8x8(MACROBLOCK *x) {
-  vp9_transform_mby_8x8(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_mby_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);
-  vp9_clear_system_state();
-  if (tx_type != DCT_DCT) {
-    vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);
-  } else {
-    x->fwd_txm16x16(x->block[0].src_diff, x->block[0].coeff, 32);
-  }
-}
-
-void vp9_transform_mb_16x16(MACROBLOCK *x) {
-  vp9_transform_mby_16x16(x);
-  vp9_transform_mbuv_8x8(x);
-}
-
-void vp9_transform_sby_32x32(MACROBLOCK *x) {
-  vp9_short_fdct32x32(x->src_diff, x->coeff, 64);
-}
-
-void vp9_transform_sby_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
-                         x->coeff + n * 256, 32, tx_type);
-    } else {
-      x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,
-                      x->coeff + n * 256, 64);
-    }
-  }
-}
-
-void vp9_transform_sby_8x8(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
-                       x->coeff + n * 64, 32, tx_type);
-    } else {
-      x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,
-                    x->coeff + n * 64, 64);
-    }
-  }
-}
-
-void vp9_transform_sby_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
-                       x->coeff + n * 16, 32, tx_type);
-    } else {
-      x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,
-                    x->coeff + n * 16, 64);
-    }
-  }
-}
-
-void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
-  vp9_clear_system_state();
-  x->fwd_txm16x16(x->src_diff + 1024, x->coeff + 1024, 32);
-  x->fwd_txm16x16(x->src_diff + 1280, x->coeff + 1280, 32);
-}
-
-void vp9_transform_sbuv_8x8(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->fwd_txm8x8(x->src_diff + 1024 + y_idx * 16 * 8 + x_idx * 8,
-                  x->coeff + 1024 + n * 64, 32);
-    x->fwd_txm8x8(x->src_diff + 1280 + y_idx * 16 * 8 + x_idx * 8,
-                  x->coeff + 1280 + n * 64, 32);
-  }
-}
-
-void vp9_transform_sbuv_4x4(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    x->fwd_txm4x4(x->src_diff + 1024 + y_idx * 16 * 4 + x_idx * 4,
-                  x->coeff + 1024 + n * 16, 32);
-    x->fwd_txm4x4(x->src_diff + 1280 + y_idx * 16 * 4 + x_idx * 4,
-                  x->coeff + 1280 + n * 16, 32);
-  }
-}
-
-void vp9_transform_sb64y_32x32(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    vp9_short_fdct32x32(x->src_diff + y_idx * 64 * 32 + x_idx * 32,
-                        x->coeff + n * 1024, 128);
-  }
-}
-
-void vp9_transform_sb64y_16x16(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
-                         x->coeff + n * 256, 64, tx_type);
-    } else {
-      x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,
-                      x->coeff + n * 256, 128);
-    }
-  }
-}
-
-void vp9_transform_sb64y_8x8(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
-                         x->coeff + n * 64, 64, tx_type);
-    } else {
-      x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,
-                    x->coeff + n * 64, 128);
-    }
-  }
-}
-
-void vp9_transform_sb64y_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 256; n++) {
-    const int x_idx = n & 15, y_idx = n >> 4;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
-                       x->coeff + n * 16, 64, tx_type);
-    } else {
-      x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,
-                    x->coeff + n * 16, 128);
-    }
-  }
-}
-
-void vp9_transform_sb64uv_32x32(MACROBLOCK *x) {
-  vp9_clear_system_state();
-  vp9_short_fdct32x32(x->src_diff + 4096,
-                      x->coeff + 4096, 64);
-  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,
-                      x->coeff + 4096 + 1024, 64);
-}
-
-void vp9_transform_sb64uv_16x16(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    x->fwd_txm16x16(x->src_diff + 4096 + y_idx * 32 * 16 + x_idx * 16,
-                    x->coeff + 4096 + n * 256, 64);
-    x->fwd_txm16x16(x->src_diff + 4096 + 1024 + y_idx * 32 * 16 + x_idx * 16,
-                    x->coeff + 4096 + 1024 + n * 256, 64);
-  }
-}
-
-void vp9_transform_sb64uv_8x8(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    x->fwd_txm8x8(x->src_diff + 4096 + y_idx * 32 * 8 + x_idx * 8,
-                  x->coeff + 4096 + n * 64, 64);
-    x->fwd_txm8x8(x->src_diff + 4096 + 1024 + y_idx * 32 * 8 + x_idx * 8,
-                  x->coeff + 4096 + 1024 + n * 64, 64);
-  }
-}
-
-void vp9_transform_sb64uv_4x4(MACROBLOCK *x) {
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
-
-    x->fwd_txm4x4(x->src_diff + 4096 + y_idx * 32 * 4 + x_idx * 4,
-                  x->coeff + 4096 + n * 16, 64);
-    x->fwd_txm4x4(x->src_diff + 4096 + 1024 + y_idx * 32 * 4 + x_idx * 4,
-                  x->coeff + 4096 + 1024 + n * 16, 64);
-  }
-}
-
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -533,126 +107,84 @@
                                      int idx, int token,
                                      uint8_t *token_cache,
                                      int pad, int l) {
-  int bak = token_cache[idx], pt;
-  token_cache[idx] = token;
+  int bak = token_cache[scan[idx]], pt;
+  token_cache[scan[idx]] = vp9_pt_energy_class[token];
   pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);
-  token_cache[idx] = bak;
+  token_cache[scan[idx]] = bak;
   return pt;
 }
 
-static void optimize_b(VP9_COMMON *const cm,
-                       MACROBLOCK *mb, int ib, PLANE_TYPE type,
-                       const int16_t *dequant_ptr,
+static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
+                       int plane, int block, BLOCK_SIZE_TYPE bsize,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size) {
-  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;
+                       TX_SIZE tx_size) {
+  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME;
   MACROBLOCKD *const xd = &mb->e_mbd;
   vp9_token_state tokens[1025][2];
   unsigned best_index[1025][2];
-  const int16_t *coeff_ptr = mb->coeff + ib * 16;
-  int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
-  int16_t *dqcoeff_ptr = xd->dqcoeff + ib * 16;
-  int eob = xd->eobs[ib], final_eob, sz = 0;
+  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff,
+                                          block, 16);
+  int16_t *qcoeff_ptr;
+  int16_t *dqcoeff_ptr;
+  int eob = xd->plane[plane].eobs[block], final_eob, sz = 0;
   const int i0 = 0;
   int rc, x, next, i;
   int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   int rate0, rate1, error0, error1, t0, t1;
   int best, band, pt;
+  PLANE_TYPE type = xd->plane[plane].plane_type;
   int err_mult = plane_rd_mult[type];
   int default_eob, pad;
   int const *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
-#if CONFIG_CODE_NONZEROCOUNT
-  // TODO(debargha): the dynamic programming approach used in this function
-  // is not compatible with the true rate cost when nzcs are used. Note
-  // the total rate is the sum of the nzc rate and the indicvidual token
-  // rates. The latter part can be optimized in this function, but because
-  // the nzc rate is a function of all the other tokens without a Markov
-  // relationship this rate cannot be considered correctly.
-  // The current implementation uses a suboptimal approach to account for
-  // the nzc rates somewhat, but in reality the optimization approach needs
-  // to change substantially.
-  uint16_t nzc = xd->nzcs[ib];
-  uint16_t nzc0, nzc1;
-  uint16_t final_nzc = 0, final_nzc_exp;
-  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
-  unsigned int *nzc_cost;
-  nzc0 = nzc1 = nzc;
-#endif
+  const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
+                                             block, 2 * tx_size);
+  const int16_t *dequant_ptr = xd->plane[plane].dequant;
+  const uint8_t * band_translate;
 
+  assert((!type && !plane) || (type && plane));
+  dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
+  qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
-#endif
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_4x4;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_4x4;
-      } else {
-        scan = vp9_default_zig_zag1d_4x4;
-      }
+      scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1));
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_8x8;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_8x8;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-      }
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
+      scan = get_scan_8x8(tx_type);
       default_eob = 64;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
-      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2));
-      if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_16x16;
-      } else if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_16x16;
-      } else {
-        scan = vp9_default_zig_zag1d_16x16;
-      }
+      const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
       default_eob = 256;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      scan = vp9_default_zig_zag1d_32x32;
+      scan = vp9_default_scan_32x32;
       default_eob = 1024;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
-#endif
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
+  assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
     rdmult = (rdmult * 9) >> 4;
   rddiv = mb->rddiv;
   memset(best_index, 0, sizeof(best_index));
   /* Initialize the sentinel node of the trellis. */
-#if CONFIG_CODE_NONZEROCOUNT
-  tokens[eob][0].rate = nzc_cost[nzc];
-#else
   tokens[eob][0].rate = 0;
-#endif
   tokens[eob][0].error = 0;
   tokens[eob][0].next = default_eob;
   tokens[eob][0].token = DCT_EOB_TOKEN;
@@ -660,14 +192,12 @@
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = 0; i < eob; i++)
-    token_cache[i] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].Token;
+    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
+        qcoeff_ptr[scan[i]]].token];
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
 
   for (i = eob; i-- > i0;) {
     int base_bits, d2, dx;
-#if CONFIG_CODE_NONZEROCOUNT
-    int new_nzc0, new_nzc1;
-#endif
 
     rc = scan[i];
     x = qcoeff_ptr[rc];
@@ -679,16 +209,18 @@
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
+      t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                        pad, default_eob);
         rate0 +=
-          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token];
+          mb->token_costs_noskip[tx_size][type][ref][band][pt]
+                                [tokens[next][0].token];
         rate1 +=
-          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token];
+          mb->token_costs_noskip[tx_size][type][ref][band][pt]
+                                [tokens[next][1].token];
       }
       UPDATE_RD_COST();
       /* And pick the best. */
@@ -702,9 +234,6 @@
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
       best_index[i][0] = best;
-#if CONFIG_CODE_NONZEROCOUNT
-      new_nzc0 = (best ? nzc1 : nzc0);
-#endif
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
@@ -731,28 +260,30 @@
              DCT_EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
              DCT_EOB_TOKEN : ZERO_TOKEN;
-#if CONFIG_CODE_NONZEROCOUNT
-        // Account for rate drop because of the nzc change.
-        // TODO(debargha): Find a better solution
-        rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1];
-        rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1];
-#endif
       } else {
-        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;
+        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                          pad, default_eob);
-          rate0 += mb->token_costs[tx_size][type][ref][band][pt][
-              tokens[next][0].token];
+          if (!x)
+            rate0 += mb->token_costs[tx_size][type][ref][band][pt][
+                tokens[next][0].token];
+          else
+            rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
+                tokens[next][0].token];
         }
         if (t1 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
                                          pad, default_eob);
-          rate1 += mb->token_costs[tx_size][type][ref][band][pt][
-              tokens[next][1].token];
+          if (!x)
+            rate1 += mb->token_costs[tx_size][type][ref][band][pt][
+                tokens[next][1].token];
+          else
+            rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
+                tokens[next][1].token];
         }
       }
 
@@ -771,11 +302,6 @@
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
       best_index[i][1] = best;
-#if CONFIG_CODE_NONZEROCOUNT
-      new_nzc1 = (best ? nzc1 : nzc0) - (!x);
-      nzc0 = new_nzc0;
-      nzc1 = new_nzc1;
-#endif
       /* Finally, make this the new head of the trellis. */
       next = i;
     }
@@ -783,7 +309,7 @@
      *  add a new trellis node, but we do need to update the costs.
      */
     else {
-      band = get_coef_band(scan, tx_size, i + 1);
+      band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -802,8 +328,8 @@
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(scan, tx_size, i + 1);
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  band = get_coef_band(band_translate, i + 1);
+  pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
   error0 = tokens[next][0].error;
@@ -810,21 +336,17 @@
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0];
-  rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];
+  rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0];
+  rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
-#if CONFIG_CODE_NONZEROCOUNT
-  final_nzc_exp = (best ? nzc1 : nzc0);
-#endif
   final_eob = i0 - 1;
+  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
+  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
   for (i = next; i < eob; i = next) {
     x = tokens[i][best].qc;
     if (x) {
       final_eob = i;
-#if CONFIG_CODE_NONZEROCOUNT
-      ++final_nzc;
-#endif
     }
     rc = scan[i];
     qcoeff_ptr[rc] = x;
@@ -835,519 +357,338 @@
   }
   final_eob++;
 
-  xd->eobs[ib] = final_eob;
+  xd->plane[plane].eobs[block] = final_eob;
   *a = *l = (final_eob > 0);
-#if CONFIG_CODE_NONZEROCOUNT
-  assert(final_nzc == final_nzc_exp);
-  xd->nzcs[ib] = final_nzc;
-#endif
 }
 
-void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+struct optimize_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb,
+                    struct optimize_ctx *ctx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  int x, y;
 
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  // find current entropy context
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 0; b < 16; b++) {
-    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
-               ta + vp9_block2above[TX_4X4][b],
-               tl + vp9_block2left[TX_4X4][b], TX_4X4);
-  }
+  optimize_b(cm, mb, plane, block, bsize,
+             &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2);
 }
 
-void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
-
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  for (b = 16; b < 24; b++) {
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
-               ta + vp9_block2above[TX_4X4][b],
-               tl + vp9_block2left[TX_4X4][b], TX_4X4);
-  }
+static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  const struct optimize_block_args* const args = arg;
+  vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x,
+                 args->ctx);
 }
 
-static void optimize_mb_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  vp9_optimize_mby_4x4(cm, x);
-  vp9_optimize_mbuv_4x4(cm, x);
-}
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx) {
+  int p;
 
-void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    const struct macroblockd_plane* const plane = &xd->plane[p];
+    const int bwl = b_width_log2(bsize) - plane->subsampling_x;
+    const int bhl = b_height_log2(bsize) - plane->subsampling_y;
+    const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+    const TX_SIZE tx_size = p ? get_uv_tx_size(mbmi)
+                              : mbmi->txfm_size;
+    int i, j;
 
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
-    return;
-
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  for (b = 0; b < 16; b += 4) {
-    ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
-    ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
-    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,
-               &above_ec, &left_ec, TX_8X8);
-    a[1] = a[0] = above_ec;
-    l[1] = l[0] = left_ec;
+    for (i = 0; i < 1 << bwl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->ta[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->ta[p][i] |= plane->above_context[i + j];
+      }
+    }
+    for (i = 0; i < 1 << bhl; i += 1 << tx_size) {
+      int c = 0;
+      ctx->tl[p][i] = 0;
+      for (j = 0; j < 1 << tx_size && !c; j++) {
+        c = ctx->tl[p][i] |= plane->left_context[i + j];
+      }
+    }
   }
 }
 
-void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int b;
-  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
-  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
-
-  if (!ta || !tl)
-    return;
-
-  for (b = 16; b < 24; b += 4) {
-    ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
-    ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
-    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
-    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,
-               &above_ec, &left_ec, TX_8X8);
-  }
+void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  struct optimize_ctx ctx;
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg);
 }
 
-static void optimize_mb_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  vp9_optimize_mby_8x8(cm, x);
-  vp9_optimize_mbuv_8x8(cm, x);
+void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                       BLOCK_SIZE_TYPE bsize) {
+  struct optimize_ctx ctx;
+  struct optimize_block_args arg = {cm, x, &ctx};
+  vp9_optimize_init(&x->e_mbd, bsize, &ctx);
+  foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
 }
 
-void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context;
-  ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context;
-  ENTROPY_CONTEXT ta, tl;
+struct encode_b_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-  if (!t_above || !t_left)
-    return;
+static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = plane_block_width(bsize, &xd->plane[plane]);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16);
+  int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      x->plane[plane].src_diff);
+  TX_TYPE tx_type = DCT_DCT;
 
-  ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
-  tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
-  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-             &ta, &tl, TX_16X16);
-}
-
-static void optimize_mb_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  vp9_optimize_mby_16x16(cm, x);
-  vp9_optimize_mbuv_8x8(cm, x);
-}
-
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT ta, tl;
-
-  ta = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  tl = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-             &ta, &tl, TX_32X32);
-}
-
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT ta[2], tl[2];
-  int n;
-
-  ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0;
-  ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0;
-  tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-
-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_16X16);
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT)
+        vp9_short_fht16x16(src_diff, coeff, bw, tx_type);
+      else
+        x->fwd_txm16x16(src_diff, coeff, bw * 2);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT)
+        vp9_short_fht8x8(src_diff, coeff, bw, tx_type);
+      else
+        x->fwd_txm8x8(src_diff, coeff, bw * 2);
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT)
+        vp9_short_fht4x4(src_diff, coeff, bw, tx_type);
+      else
+        x->fwd_txm4x4(src_diff, coeff, bw * 2);
+      break;
+    default:
+      assert(0);
   }
-}
 
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT ta[4], tl[4];
-  int n;
-
-  ta[0] = (a[0] + a[1]) != 0;
-  ta[1] = (a[2] + a[3]) != 0;
-  ta[2] = (a1[0] + a1[1]) != 0;
-  ta[3] = (a1[2] + a1[3]) != 0;
-  tl[0] = (l[0] + l[1]) != 0;
-  tl[1] = (l[2] + l[3]) != 0;
-  tl[2] = (l1[0] + l1[1]) != 0;
-  tl[3] = (l1[2] + l1[3]) != 0;
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
-
-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_8X8);
-  }
+  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
 }
 
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT ta[8], tl[8];
-  int n;
+static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 pd->dst.buf, pd->dst.stride);
+  TX_TYPE tx_type = DCT_DCT;
 
-  vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
 
-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_4X4);
-  }
-}
+  if (x->optimize)
+    vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
 
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;
-  int b;
-
-  for (b = 64; b < 96; b += 16) {
-    const int cidx = b >= 80 ? 20 : 16;
-    a = ta + vp9_block2above_sb[TX_16X16][b];
-    l = tl + vp9_block2left_sb[TX_16X16][b];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_16X16);
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
+                                        dst, pd->dst.stride);
+      else
+        vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
   }
 }
 
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l, above_ec, left_ec;
-  int b;
+void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 64; b < 96; b += 4) {
-    const int cidx = b >= 80 ? 20 : 16;
-    a = ta + vp9_block2above_sb[TX_8X8][b];
-    l = tl + vp9_block2left_sb[TX_8X8][b];
-    above_ec = (a[0] + a[1]) != 0;
-    left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_8X8);
-    a[0] = a[1] = above_ec;
-    l[0] = l[1] = left_ec;
-  }
+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 }
 
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l;
-  int b;
+void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 64; b < 96; b++) {
-    const int cidx = b >= 80 ? 20 : 16;
-    a = ta + vp9_block2above_sb[TX_4X4][b];
-    l = tl + vp9_block2left_sb[TX_4X4][b];
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               a, l, TX_4X4);
-  }
+  foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
 
-void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
-  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);
-  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);
-  ENTROPY_CONTEXT ta[2], tl[2];
-  int n;
+void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  ta[0] = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  ta[1] = (a2[0] + a2[1] + a2[2] + a2[3] + a3[0] + a3[1] + a3[2] + a3[3]) != 0;
-  tl[0] = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  tl[1] = (l2[0] + l2[1] + l2[2] + l2[3] + l3[0] + l3[1] + l3[2] + l3[3]) != 0;
-  for (n = 0; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
+  vp9_subtract_sby(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_32X32);
-  }
+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }
 
-void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
-  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);
-  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);
-  ENTROPY_CONTEXT ta[4], tl[4];
-  int n;
+void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0;
-  ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-  ta[2] = (a2[0] + a2[1] + a2[2] + a2[3]) != 0;
-  ta[3] = (a3[0] + a3[1] + a3[2] + a3[3]) != 0;
-  tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0;
-  tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-  tl[2] = (l2[0] + l2[1] + l2[2] + l2[3]) != 0;
-  tl[3] = (l3[0] + l3[1] + l3[2] + l3[3]) != 0;
-  for (n = 0; n < 16; n++) {
-    const int x_idx = n & 3, y_idx = n >> 2;
+  vp9_subtract_sbuv(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_16X16);
-  }
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 }
 
-void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);
-  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);
-  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);
-  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);
-  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);
-  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);
-  ENTROPY_CONTEXT ta[8], tl[8];
-  int n;
+void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  ta[0] = (a[0] + a[1]) != 0;
-  ta[1] = (a[2] + a[3]) != 0;
-  ta[2] = (a1[0] + a1[1]) != 0;
-  ta[3] = (a1[2] + a1[3]) != 0;
-  ta[4] = (a2[0] + a2[1]) != 0;
-  ta[5] = (a2[2] + a2[3]) != 0;
-  ta[6] = (a3[0] + a3[1]) != 0;
-  ta[7] = (a3[2] + a3[3]) != 0;
-  tl[0] = (l[0] + l[1]) != 0;
-  tl[1] = (l[2] + l[3]) != 0;
-  tl[2] = (l1[0] + l1[1]) != 0;
-  tl[3] = (l1[2] + l1[3]) != 0;
-  tl[4] = (l2[0] + l2[1]) != 0;
-  tl[5] = (l2[2] + l2[3]) != 0;
-  tl[6] = (l3[0] + l3[1]) != 0;
-  tl[7] = (l3[2] + l3[3]) != 0;
-  for (n = 0; n < 64; n++) {
-    const int x_idx = n & 7, y_idx = n >> 3;
+  vp9_subtract_sb(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_8X8);
-  }
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
 
-void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT ta[16], tl[16];
-  int n;
+static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                               int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  const int bw = plane_block_width(bsize, pd);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
 
-  vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 8, x->e_mbd.above_context + 2, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(ta + 12, x->e_mbd.above_context + 3, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 8, x->e_mbd.left_context + 2, 4 * sizeof(ENTROPY_CONTEXT));
-  vpx_memcpy(tl + 12, x->e_mbd.left_context + 3, 4 * sizeof(ENTROPY_CONTEXT));
-  for (n = 0; n < 256; n++) {
-    const int x_idx = n & 15, y_idx = n >> 4;
+  uint8_t *const src = raster_block_offset_uint8(xd, bsize, plane, raster_block,
+                                                 p->src.buf, p->src.stride);
+  uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block,
+                                                 pd->dst.buf, pd->dst.stride);
+  int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      p->src_diff);
 
-    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,
-               ta + x_idx, tl + y_idx, TX_4X4);
-  }
-}
+  const int txfm_b_size = 4 << tx_size;
+  int ib = raster_block;
+  int tx_ib = ib >> tx_size;
+  int plane_b_size;
 
-void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
-  int b;
+  TX_TYPE tx_type;
+  int mode, b_mode;
 
-  for (b = 256; b < 384; b += 64) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_32X32][b];
-    l = tl + vp9_block2left_sb64[TX_32X32][b];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-    l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &a_ec, &l_ec, TX_32X32);
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    extend_for_intra(xd, plane, block, bsize, ss_txfrm_size);
   }
-}
 
-void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;
-  int b;
+  mode = plane == 0? mbmi->mode: mbmi->uv_mode;
+  if (plane == 0 &&
+      mbmi->sb_type < BLOCK_SIZE_SB8X8 &&
+      mbmi->ref_frame[0] == INTRA_FRAME)
+    b_mode = xd->mode_info_context->bmi[ib].as_mode.first;
+  else
+    b_mode = mode;
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 256; b < 384; b += 16) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_16X16][b];
-    l = tl + vp9_block2left_sb64[TX_16X16][b];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_16X16);
-    a[0] = a[1] = a1[0] = a1[1] = above_ec;
-    l[0] = l[1] = l1[0] = l1[1] = left_ec;
-  }
-}
+  assert(b_mode >= DC_PRED && b_mode <= TM_PRED);
 
-void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l, above_ec, left_ec;
-  int b;
+  plane_b_size = b_width_log2(bsize) - pd->subsampling_x;
+  vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode,
+                          dst, pd->dst.stride);
+  vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw,
+                     src, p->src.stride, dst, pd->dst.stride);
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 256; b < 384; b += 4) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_8X8][b];
-    l = tl + vp9_block2left_sb64[TX_8X8][b];
-    above_ec = (a[0] + a[1]) != 0;
-    left_ec = (l[0] + l[1]) != 0;
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               &above_ec, &left_ec, TX_8X8);
-    a[0] = a[1] = above_ec;
-    l[0] = l[1] = left_ec;
-  }
-}
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
 
-void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;
-  ENTROPY_CONTEXT *a, *l;
-  int b;
 
-  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));
-  for (b = 256; b < 384; b++) {
-    const int cidx = b >= 320 ? 20 : 16;
-    a = ta + vp9_block2above_sb64[TX_4X4][b];
-    l = tl + vp9_block2left_sb64[TX_4X4][b];
-    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,
-               a, l, TX_4X4);
-  }
-}
+  // if (x->optimize)
+  // vp9_optimize_b(plane, block, bsize, ss_txfrm_size,
+  //                args->cm, x, args->ctx);
 
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-
-  if (tx_size == TX_16X16) {
-    vp9_transform_mb_16x16(x);
-    vp9_quantize_mb_16x16(x);
-    if (x->optimize)
-      optimize_mb_16x16(cm, x);
-    vp9_inverse_transform_mb_16x16(xd);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_mby_8x8(x);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mby_8x8(x);
-      vp9_quantize_mbuv_4x4(x);
-      if (x->optimize) {
-        vp9_optimize_mby_8x8(cm, x);
-        vp9_optimize_mbuv_4x4(cm, x);
-      }
-      vp9_inverse_transform_mby_8x8(xd);
-      vp9_inverse_transform_mbuv_4x4(xd);
-    } else {
-      vp9_transform_mb_8x8(x);
-      vp9_quantize_mb_8x8(x);
-      if (x->optimize)
-        optimize_mb_8x8(cm, x);
-      vp9_inverse_transform_mb_8x8(xd);
-    }
-  } else {
-    transform_mb_4x4(x);
-    vp9_quantize_mb_4x4(x);
-    if (x->optimize)
-      optimize_mb_4x4(cm, x);
-    vp9_inverse_transform_mb_4x4(xd);
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+        vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
+      else
+        vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT)
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
+                                        dst, pd->dst.stride);
+      else
+        vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+      break;
   }
 }
 
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+void vp9_encode_intra_block_y(VP9_COMMON *cm, MACROBLOCK *x,
+                              BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  vp9_build_inter_predictors_mb(xd, mb_row, mb_col);
-  subtract_mb(x);
-  vp9_fidct_mb(cm, x);
-  vp9_recon_mb(xd);
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block_intra, &arg);
 }
-
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-
-  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
-
-  vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
-
-  vp9_transform_mby_4x4(x);
-  vp9_quantize_mby_4x4(x);
-  vp9_inverse_transform_mby_4x4(xd);
-
-  vp9_recon_mby(xd);
+void vp9_encode_intra_block_uv(VP9_COMMON *cm, MACROBLOCK *x,
+                              BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
+  foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg);
 }
+
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,82 +22,32 @@
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
 
-struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col);
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx);
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *x,
+                    struct optimize_ctx *ctx);
+void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_optimize_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_transform_mbuv_4x4(MACROBLOCK *x);
-void vp9_transform_mby_4x4(MACROBLOCK *x);
+void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
+void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
-void vp9_transform_mb_8x8(MACROBLOCK *mb);
-void vp9_transform_mby_8x8(MACROBLOCK *x);
-void vp9_transform_mbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_transform_mb_16x16(MACROBLOCK *mb);
-void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_transform_sby_32x32(MACROBLOCK *x);
-void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sby_16x16(MACROBLOCK *x);
-void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sby_8x8(MACROBLOCK *x);
-void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sby_4x4(MACROBLOCK *x);
-void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sbuv_16x16(MACROBLOCK *x);
-void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sbuv_8x8(MACROBLOCK *x);
-void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sbuv_4x4(MACROBLOCK *x);
-void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_transform_sb64y_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64y_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64y_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64y_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_32x32(MACROBLOCK *x);
-void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_16x16(MACROBLOCK *x);
-void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_8x8(MACROBLOCK *x);
-void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);
-void vp9_transform_sb64uv_4x4(MACROBLOCK *x);
-void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
-
-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
-
-void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride);
-void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src,
-                          int src_stride, const uint8_t *pred,
-                          int dst_stride);
-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                          const uint8_t *pred, int dst_stride);
-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
-                           const uint8_t *vsrc, int src_stride,
-                           const uint8_t *upred,
-                           const uint8_t *vpred, int dst_stride);
-void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,
-                            const uint8_t *pred, int dst_stride);
-void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,
-                             const uint8_t *vsrc, int src_stride,
-                             const uint8_t *upred,
-                             const uint8_t *vpred, int dst_stride);
+void vp9_subtract_block(int rows, int cols,
+                        int16_t *diff_ptr, int diff_stride,
+                        const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *pred_ptr, int pred_stride);
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
 
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -24,68 +24,48 @@
 nmv_context_counts tnmvcounts;
 #endif
 
-static void encode_nmv_component(vp9_writer* const bc,
-                                 int v,
-                                 int r,
-                                 const nmv_component* const mvcomp) {
-  int s, z, c, o, d;
-  assert (v != 0);            /* should not be zero */
-  s = v < 0;
-  vp9_write(bc, s, mvcomp->sign);
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+static void encode_mv_component(vp9_writer* w, int comp,
+                                const nmv_component* mvcomp, int usehp) {
+  int offset;
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  const int mv_class = vp9_get_mv_class(mag - 1, &offset);
+  const int d = offset >> 3;                // int mv data
+  const int fr = (offset >> 1) & 3;         // fractional mv data
+  const int hp = offset & 1;                // high precision mv data
 
-  c = vp9_get_mv_class(z, &o);
+  assert(comp != 0);
 
-  write_token(bc, vp9_mv_class_tree, mvcomp->classes,
-              vp9_mv_class_encodings + c);
+  // Sign
+  vp9_write(w, sign, mvcomp->sign);
 
-  d = (o >> 3);               /* int mv data */
+  // Class
+  write_token(w, vp9_mv_class_tree, mvcomp->classes,
+              &vp9_mv_class_encodings[mv_class]);
 
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_class0_tree, mvcomp->class0,
-                vp9_mv_class0_encodings + d);
+  // Integer bits
+  if (mv_class == MV_CLASS_0) {
+    write_token(w, vp9_mv_class0_tree, mvcomp->class0,
+                &vp9_mv_class0_encodings[d]);
   } else {
-    int i, b;
-    b = c + CLASS0_BITS - 1;  /* number of bits */
-    for (i = 0; i < b; ++i)
-      vp9_write(bc, ((d >> i) & 1), mvcomp->bits[i]);
+    int i;
+    const int n = mv_class + CLASS0_BITS - 1;  // number of bits
+    for (i = 0; i < n; ++i)
+      vp9_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
-}
 
-static void encode_nmv_component_fp(vp9_writer *bc,
-                                    int v,
-                                    int r,
-                                    const nmv_component* const mvcomp,
-                                    int usehp) {
-  int s, z, c, o, d, f, e;
-  assert (v != 0);            /* should not be zero */
-  s = v < 0;
-  z = (s ? -v : v) - 1;       /* magnitude - 1 */
+  // Fractional bits
+  write_token(w, vp9_mv_fp_tree,
+              mv_class == MV_CLASS_0 ?  mvcomp->class0_fp[d] : mvcomp->fp,
+              &vp9_mv_fp_encodings[fr]);
 
-  c = vp9_get_mv_class(z, &o);
-
-  d = (o >> 3);               /* int mv data */
-  f = (o >> 1) & 3;           /* fractional pel mv data */
-  e = (o & 1);                /* high precision mv data */
-
-  /* Code the fractional pel bits */
-  if (c == MV_CLASS_0) {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->class0_fp[d],
-                vp9_mv_fp_encodings + f);
-  } else {
-    write_token(bc, vp9_mv_fp_tree, mvcomp->fp,
-                vp9_mv_fp_encodings + f);
-  }
-  /* Code the high precision bit */
-  if (usehp) {
-    if (c == MV_CLASS_0) {
-      vp9_write(bc, e, mvcomp->class0_hp);
-    } else {
-      vp9_write(bc, e, mvcomp->hp);
-    }
-  }
+  // High precision bit
+  if (usehp)
+    vp9_write(w, hp,
+              mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
+
 static void build_nmv_component_cost_table(int *mvcost,
                                            const nmv_component* const mvcomp,
                                            int usehp) {
@@ -556,30 +536,19 @@
   }
 }
 
-void vp9_encode_nmv(vp9_writer* const bc, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
-  write_token(bc, vp9_mv_joint_tree, mvctx->joints,
-              vp9_mv_joint_encodings + j);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->row, ref->col, &mvctx->comps[0]);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component(bc, mv->col, ref->col, &mvctx->comps[1]);
-  }
-}
-
-void vp9_encode_nmv_fp(vp9_writer* const bc, const MV* const mv,
-                       const MV* const ref, const nmv_context* const mvctx,
-                       int usehp) {
-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);
+void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp) {
+  const MV diff = {mv->row - ref->row,
+                   mv->col - ref->col};
+  const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
   usehp = usehp && vp9_use_nmv_hp(ref);
-  if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->row, ref->row, &mvctx->comps[0], usehp);
-  }
-  if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {
-    encode_nmv_component_fp(bc, mv->col, ref->col, &mvctx->comps[1], usehp);
-  }
+
+  write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
+  if (mv_joint_vertical(j))
+    encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+  if (mv_joint_horizontal(j))
+    encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
 }
 
 void vp9_build_nmv_cost_table(int *mvjoint,
@@ -600,62 +569,42 @@
                          int_mv *best_ref_mv, int_mv *second_best_ref_mv) {
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   MV mv;
+  int bwl = b_width_log2(mbmi->sb_type), bw = 1 << bwl;
+  int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl;
+  int idx, idy;
 
-  if (mbmi->mode == SPLITMV) {
+  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
     int i;
-
-    for (i = 0; i < x->partition_info->count; i++) {
-      if (x->partition_info->bmi[i].mode == NEW4X4) {
-        if (x->e_mbd.allow_high_precision_mv) {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame > 0) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 1);
+    PARTITION_INFO *pi = x->partition_info;
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        i = idy * 2 + idx;
+        if (pi->bmi[i].mode == NEWMV) {
+          mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row);
+          mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col);
+          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+                            x->e_mbd.allow_high_precision_mv);
+          if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) {
+            mv.row = pi->bmi[i].second_mv.as_mv.row -
+                         second_best_ref_mv->as_mv.row;
+            mv.col = pi->bmi[i].second_mv.as_mv.col -
+                         second_best_ref_mv->as_mv.col;
+            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+                              x->e_mbd.allow_high_precision_mv);
           }
-        } else {
-          mv.row = (x->partition_info->bmi[i].mv.as_mv.row
-                    - best_ref_mv->as_mv.row);
-          mv.col = (x->partition_info->bmi[i].mv.as_mv.col
-                    - best_ref_mv->as_mv.col);
-          vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-          if (x->e_mbd.mode_info_context->mbmi.second_ref_frame > 0) {
-            mv.row = (x->partition_info->bmi[i].second_mv.as_mv.row
-                      - second_best_ref_mv->as_mv.row);
-            mv.col = (x->partition_info->bmi[i].second_mv.as_mv.col
-                      - second_best_ref_mv->as_mv.col);
-            vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv,
-                              &cpi->NMVcount, 0);
-          }
         }
       }
     }
   } else if (mbmi->mode == NEWMV) {
-    if (x->e_mbd.allow_high_precision_mv) {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      if (mbmi->second_ref_frame > 0) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 1);
-      }
-    } else {
-      mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
-      mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
-      vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      if (mbmi->second_ref_frame > 0) {
-        mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
-        mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
-        vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, 0);
-      }
+    mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row);
+    mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col);
+    vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount,
+                      x->e_mbd.allow_high_precision_mv);
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
+      mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row);
+      mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col);
+      vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount,
+                        x->e_mbd.allow_high_precision_mv);
     }
   }
 }
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -15,11 +15,10 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 
 void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
-void vp9_encode_nmv(vp9_writer* const w, const MV* const mv,
-                    const MV* const ref, const nmv_context* const mvctx);
-void vp9_encode_nmv_fp(vp9_writer* const w, const MV* const mv,
-                       const MV* const ref, const nmv_context* const mvctx,
-                       int usehp);
+
+void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
+                   const nmv_context* mvctx, int usehp);
+
 void vp9_build_nmv_cost_table(int *mvjoint,
                               int *mvcost[2],
                               const nmv_context* const mvctx,
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -14,7 +14,6 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vpx_scale/vpx_scale.h"
@@ -23,7 +22,7 @@
 #include "vp9/common/vp9_extend.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
+#include "vpx_scale/yv12config.h"
 #include <stdio.h>
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rdopt.h"
@@ -32,6 +31,8 @@
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "./vpx_scale_rtcd.h"
+// TODO(jkoleszar): for setup_dst_planes
+#include "vp9/common/vp9_reconinter.h"
 
 #define OUTPUT_FPF 0
 
@@ -38,7 +39,7 @@
 #define IIFACTOR   12.5
 #define IIKFACTOR1 12.5
 #define IIKFACTOR2 15.0
-#define RMAX       128.0
+#define RMAX       512.0
 #define GF_RMAX    96.0
 #define ERR_DIVISOR   150.0
 #define MIN_DECAY_FACTOR 0.1
@@ -46,11 +47,17 @@
 #define KF_MB_INTRA_MIN 150
 #define GF_MB_INTRA_MIN 100
 
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
 #define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
 
+static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
+  YV12_BUFFER_CONFIG temp = *a;
+  *a = *b;
+  *b = temp;
+}
+
 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame);
 
 static int select_cq_level(int qindex) {
@@ -71,8 +78,8 @@
 
 
 // Resets the first pass file to the given position using a relative seek from the current position
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
+  cpi->twopass.stats_in = position;
 }
 
 static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
@@ -128,7 +135,7 @@
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+    fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
             "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
             "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
             stats->frame,
@@ -245,17 +252,11 @@
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
+  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  const double av_err = stats->ssim_weighted_pred_err / stats->count;
+  const double this_err = this_frame->ssim_weighted_pred_err;
+  return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
+                      this_err > av_err ? POW1 : POW2);
 }
 
 static const double weight_table[256] = {
@@ -317,46 +318,69 @@
 }
 
 
-// This function returns the current per frame maximum bitrate target
+// This function returns the current per frame maximum bitrate target.
 static int frame_max_bits(VP9_COMP *cpi) {
-  // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
-  int max_bits;
+  // Max allocation for a single frame based on the max section guidelines
+  // passed in and how many bits are left.
+  // For VBR base this on the bits and frames left plus the
+  // two_pass_vbrmax_section rate passed in by the user.
+  const double max_bits = (1.0 * cpi->twopass.bits_left /
+      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
+      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
 
-  // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
-  max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
-
-  // Trap case where we are out of bits
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
+  // Trap case where we are out of bits.
+  return MAX((int)max_bits, 0);
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_stats);
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
 static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
 
-  uint8_t *src_ptr = (*(b->base_src) + b->src);
-  int src_stride = b->src_stride;
-  uint8_t *ref_ptr;
-  int ref_stride = d->pre_stride;
-
   // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
-  ref_ptr = (uint8_t *)(*(d->base_pre) + d->pre);
+  switch (xd->mode_info_context->mbmi.sb_type) {
+    case BLOCK_SIZE_SB8X8:
+      vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
+                 xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                 (unsigned int *)(best_motion_err));
+      break;
+    case BLOCK_SIZE_SB16X8:
+      vp9_mse16x8(x->plane[0].src.buf, x->plane[0].src.stride,
+                  xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                  (unsigned int *)(best_motion_err));
+      break;
+    case BLOCK_SIZE_SB8X16:
+      vp9_mse8x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                  xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                  (unsigned int *)(best_motion_err));
+      break;
+    default:
+      vp9_mse16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                   xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                   (unsigned int *)(best_motion_err));
+      break;
+  }
+}
 
-  vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
-               (unsigned int *)(best_motion_err));
+static enum BlockSize get_bs(BLOCK_SIZE_TYPE b) {
+  switch (b) {
+    case BLOCK_SIZE_SB8X8:
+      return BLOCK_8X8;
+    case BLOCK_SIZE_SB16X8:
+      return BLOCK_16X8;
+    case BLOCK_SIZE_SB8X16:
+      return BLOCK_8X16;
+    default:
+      return BLOCK_16X16;
+  }
 }
 
 static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -364,8 +388,6 @@
                                      YV12_BUFFER_CONFIG *recon_buffer,
                                      int *best_motion_err, int recon_yoffset) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
   int num00;
 
   int_mv tmp_mv;
@@ -375,7 +397,8 @@
   int step_param = 3;
   int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   int n;
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  vp9_variance_fn_ptr_t v_fn_ptr =
+      cpi->fn_ptr[get_bs(xd->mode_info_context->mbmi.sb_type)];
   int new_mv_mode_penalty = 256;
 
   int sr = 0;
@@ -392,16 +415,29 @@
   further_steps -= sr;
 
   // override the default variance function to use MSE
-  v_fn_ptr.vf = vp9_mse16x16;
+  switch (xd->mode_info_context->mbmi.sb_type) {
+    case BLOCK_SIZE_SB8X8:
+      v_fn_ptr.vf = vp9_mse8x8;
+      break;
+    case BLOCK_SIZE_SB16X8:
+      v_fn_ptr.vf = vp9_mse16x8;
+      break;
+    case BLOCK_SIZE_SB8X16:
+      v_fn_ptr.vf = vp9_mse8x16;
+      break;
+    default:
+      v_fn_ptr.vf = vp9_mse16x16;
+      break;
+  }
 
   // Set up pointers for this macro block recon buffer
-  xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+  xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
 
   // Initial step/diamond search centred on best mv
   tmp_mv.as_int = 0;
   ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
   ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
-  tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+  tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param,
                                     x->sadperbit16, &num00, &v_fn_ptr,
                                     x->nmvjointcost,
                                     x->mvcost, ref_mv);
@@ -424,7 +460,7 @@
     if (num00)
       num00--;
     else {
-      tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+      tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
                                         step_param + n, x->sadperbit16,
                                         &num00, &v_fn_ptr,
                                         x->nmvjointcost,
@@ -448,13 +484,13 @@
   MACROBLOCKD *const xd = &x->e_mbd;
 
   int recon_yoffset, recon_uvoffset;
-  YV12_BUFFER_CONFIG *lst_yv12 =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];
-  YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-  YV12_BUFFER_CONFIG *gld_yv12 =
-      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];
-  int recon_y_stride = lst_yv12->y_stride;
-  int recon_uv_stride = lst_yv12->uv_stride;
+  const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
+  const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
+  YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
+  YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+  YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+  const int recon_y_stride = lst_yv12->y_stride;
+  const int recon_uv_stride = lst_yv12->uv_stride;
   int64_t intra_error = 0;
   int64_t coded_error = 0;
   int64_t sr_coded_error = 0;
@@ -477,9 +513,9 @@
 
   vp9_clear_system_state();  // __asm emms;
 
-  x->src = * cpi->Source;
-  xd->pre = *lst_yv12;
-  xd->dst = *new_yv12;
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
+  setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL);
+  setup_dst_planes(xd, new_yv12, 0, 0);
 
   x->partition_info = x->pi;
 
@@ -487,12 +523,8 @@
 
   vp9_build_block_offsets(x);
 
-  vp9_setup_block_dptrs(&x->e_mbd);
+  vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
 
-  vp9_setup_block_ptrs(x);
-
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
   vp9_frame_init_quantizer(cpi);
 
   // Initialise the MV cost table to the defaults
@@ -500,7 +532,7 @@
   // if ( 0 )
   {
     vp9_init_mv_probs(cm);
-    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);
+    vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q);
   }
 
   // for each macroblock row in image
@@ -515,11 +547,10 @@
     recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
     // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-    x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 16));
+    x->mv_row_min = -((mb_row * 16) + (VP9BORDERINPIXELS - 8));
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
-                    + (VP9BORDERINPIXELS - 16);
+                    + (VP9BORDERINPIXELS - 8);
 
-
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
       int this_error;
@@ -526,11 +557,31 @@
       int gf_motion_error = INT_MAX;
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-      xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
-      xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
 
+      if (mb_col * 2 + 1 < cm->mi_cols) {
+        if (mb_row * 2 + 1 < cm->mi_rows) {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
+        } else {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB16X8;
+        }
+      } else {
+        if (mb_row * 2 + 1 < cm->mi_rows) {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB8X16;
+        } else {
+          xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB8X8;
+        }
+      }
+      xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
+      set_mi_row_col(cm, xd,
+                     mb_row << 1,
+                     1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type),
+                     mb_col << 1,
+                     1 << mi_width_log2(xd->mode_info_context->mbmi.sb_type));
+
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(cpi, x, use_dc_pred);
 
@@ -544,9 +595,9 @@
       intra_error += (int64_t)this_error;
 
       // Set up limit values for motion vectors to prevent them extending outside the UMV borders
-      x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 16));
+      x->mv_col_min = -((mb_col * 16) + (VP9BORDERINPIXELS - 8));
       x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
-                      + (VP9BORDERINPIXELS - 16);
+                      + (VP9BORDERINPIXELS - 8);
 
       // Other than for the first frame do a motion search
       if (cm->current_video_frame > 0) {
@@ -592,9 +643,9 @@
           }
 
           // Reset to last frame as reference buffer
-          xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
-          xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
-          xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+          xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset;
 
           // In accumulating a score for the older reference frame
           // take the best of the motion predicted score and
@@ -626,7 +677,12 @@
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x, mb_row, mb_col);
+          xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
+          xd->mode_info_context->mbmi.ref_frame[1] = NONE;
+          vp9_build_inter_predictors_sby(xd, mb_row << 1,
+                                         mb_col << 1,
+                                         xd->mode_info_context->mbmi.sb_type);
+          vp9_encode_sby(cm, x, xd->mode_info_context->mbmi.sb_type);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -679,9 +735,9 @@
       coded_error += (int64_t)this_error;
 
       // adjust to the next column of macroblocks
-      x->src.y_buffer += 16;
-      x->src.u_buffer += 8;
-      x->src.v_buffer += 8;
+      x->plane[0].src.buf += 16;
+      x->plane[1].src.buf += 8;
+      x->plane[2].src.buf += 8;
 
       recon_yoffset += 16;
       recon_uvoffset += 8;
@@ -688,13 +744,10 @@
     }
 
     // adjust to the next row of mbs
-    x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-    x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-    x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+    x->plane[1].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols;
+    x->plane[2].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols;
 
-    // extend the recon for intra prediction
-    vp9_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
-                      xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
     vp9_clear_system_state();  // __asm emms;
   }
 
@@ -746,16 +799,14 @@
     }
 
     // TODO:  handle the case when duration is set to 0, or something less
-    // than the full time between subsequent cpi->source_time_stamp s  .
+    // than the full time between subsequent values of cpi->source_time_stamp.
     fps.duration = (double)(cpi->source->ts_end
                             - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
-           &fps,
-           sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
+    cpi->twopass.this_frame_stats = fps;
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and and arf buffers if
@@ -762,9 +813,9 @@
   // the prediction is good enough... but also dont allow it to lag too far
   if ((cpi->twopass.sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
@@ -772,15 +823,14 @@
     cpi->twopass.sr_update_lag++;
 
   // swap frame pointers so last frame refers to the frame we just compressed
-  vp9_swap_yv12_buffer(lst_yv12, new_yv12);
-  vp8_yv12_extend_frame_borders(lst_yv12);
+  swap_yv12(lst_yv12, new_yv12);
 
+  vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
+
   // Special case for the first frame. Copy into the GF buffer as a second reference.
-  if (cm->current_video_frame == 0) {
+  if (cm->current_video_frame == 0)
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
-  }
 
-
   // use this to see what the first pass reconstruction looks like
   if (0) {
     char filename[512];
@@ -849,38 +899,28 @@
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
+                                     int q) {
+  const double error_term = err_per_mb / err_divisor;
 
   // Adjustment based on actual quantizer to power term.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
+  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+                                pt_high);
 
   // Calculate correction factor
   if (power_term < 1.0)
     assert(error_term >= 0.0);
-  correction_factor = pow(error_term, power_term);
 
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
-
-  return correction_factor;
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
 
 // Given a current maxQ value sets a range for future values.
 // PGW TODO..
-// This code removes direct dependency on QIndex to determin the range
+// This code removes direct dependency on QIndex to determine the range
 // (now uses the actual quantizer) but has not been tuned.
 static void adjust_maxq_qrange(VP9_COMP *cpi) {
   int i;
-  double q;
-
   // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
+  double q = cpi->avg_q * 2.0;
   cpi->twopass.maxq_max_limit = cpi->worst_quality;
   for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
     cpi->twopass.maxq_max_limit = i;
@@ -901,12 +941,11 @@
 static int estimate_max_q(VP9_COMP *cpi,
                           FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
+  double section_err = fpstats->coded_error / fpstats->count;
   double sr_correction;
   double err_per_mb = section_err / num_mbs;
   double err_correction_factor;
@@ -915,23 +954,16 @@
   if (section_target_bandwitdh <= 0)
     return cpi->twopass.maxq_max_limit;          // Highest value allowed
 
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
+  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
   if (fpstats->sr_coded_error > fpstats->coded_error) {
-    sr_err_diff =
-      (fpstats->sr_coded_error - fpstats->coded_error) /
-      (fpstats->count * cpi->common.MBs);
-    sr_correction = (sr_err_diff / 32.0);
-    sr_correction = pow(sr_correction, 0.25);
-    if (sr_correction < 0.75)
-      sr_correction = 0.75;
-    else if (sr_correction > 1.25)
-      sr_correction = 1.25;
+    double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
+                             (fpstats->count * cpi->common.MBs);
+    sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
   } else {
     sr_correction = 0.75;
   }
@@ -938,69 +970,58 @@
 
   // Calculate a corrective factor based on a rolling ratio of bits spent
   // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
+  if (cpi->rolling_target_bits > 0 &&
+      cpi->active_worst_quality < cpi->worst_quality) {
+    double rolling_ratio = (double)cpi->rolling_actual_bits /
+                               (double)cpi->rolling_target_bits;
 
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
-
     if (rolling_ratio < 0.95)
       cpi->twopass.est_max_qcorrection_factor -= 0.005;
     else if (rolling_ratio > 1.05)
       cpi->twopass.est_max_qcorrection_factor += 0.005;
 
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+    cpi->twopass.est_max_qcorrection_factor = fclamp(
+        cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
   }
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
+  if (cpi->compressor_speed == 1)
+    speed_correction = cpi->oxcf.cpu_used <= 5 ?
+                          1.04 + (cpi->oxcf.cpu_used * 0.04) :
+                          1.25;
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+  for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
     int bits_per_mb_at_this_q;
 
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
+    err_correction_factor = calc_correction_factor(err_per_mb,
+                                                   ERR_DIVISOR, 0.4, 0.90, q) *
+                                sr_correction * speed_correction *
+                                cpi->twopass.est_max_qcorrection_factor;
 
+    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
+                                            err_correction_factor);
 
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
-
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
+  if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+      q < cpi->cq_target_quality)
+    q = cpi->cq_target_quality;
 
   // Adjust maxq_min_limit and maxq_max_limit limits based on
-  // averaga q observed in clip for non kf/gf/arf frames
+  // average q observed in clip for non kf/gf/arf frames
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
-  if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats->count >> 8)) &&
-      (cpi->ni_frames > 25)) {
+  if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
+      cpi->ni_frames > 25)
     adjust_maxq_qrange(cpi);
-  }
 
-  return Q;
+  return q;
 }
 
 // For cq mode estimate a cq level that matches the observed
@@ -1008,7 +1029,7 @@
 static int estimate_cq(VP9_COMP *cpi,
                        FIRSTPASS_STATS *fpstats,
                        int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
@@ -1052,23 +1073,23 @@
   }
 
   // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iiratio = cpi->twopass.total_stats.intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
   clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
   if (clip_iifactor < 0.80)
     clip_iifactor = 0.80;
 
   // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
+  for (q = 0; q < MAXQ; q++) {
     int bits_per_mb_at_this_q;
 
     // Error per MB based correction factor
     err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1075,13 +1096,13 @@
   }
 
   // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
+  q = select_cq_level(q);
+  if (q >= cpi->worst_quality)
+    q = cpi->worst_quality - 1;
+  if (q < cpi->best_quality)
+    q = cpi->best_quality;
 
-  return Q;
+  return q;
 }
 
 
@@ -1098,14 +1119,14 @@
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
 
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
+  zero_stats(&cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_left_stats);
 
   if (!cpi->twopass.stats_in_end)
     return;
 
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
@@ -1112,14 +1133,13 @@
   // encoded in the second pass is a guess.  However the sum duration is not.
   // Its calculated based on the actual durations of all frames from the first
   // pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
+  vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+                       cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
                                       two_pass_min_rate / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
@@ -1145,7 +1165,8 @@
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+    cpi->twopass.avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
     // Reset file position
     reset_fpf_position(cpi, start_pos);
@@ -1185,9 +1206,8 @@
 
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
+  mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
+                   cpi->common.MBs;
   if (mb_sr_err_diff <= 512.0) {
     second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
     second_ref_decay = pow(second_ref_decay, 0.5);
@@ -1214,14 +1234,14 @@
   int still_interval,
   double loop_decay_rate,
   double last_decay_rate) {
-  int trans_to_still = FALSE;
+  int trans_to_still = 0;
 
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
   // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
+  if (frame_interval > MIN_GF_INTERVAL &&
+      loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
@@ -1243,7 +1263,7 @@
 
     // Only if it does do we signal a transition to still
     if (j == still_interval)
-      trans_to_still = TRUE;
+      trans_to_still = 1;
   }
 
   return trans_to_still;
@@ -1255,7 +1275,7 @@
 static int detect_flash(VP9_COMP *cpi, int offset) {
   FIRSTPASS_STATS next_frame;
 
-  int flash_detected = FALSE;
+  int flash_detected = 0;
 
   // Read the frame data.
   // The return is FALSE (no flash detected) if not a valid frame
@@ -1265,10 +1285,9 @@
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
     // comapred to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
-      flash_detected = TRUE;
-    }
+    if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+        next_frame.pcnt_second_ref >= 0.5)
+      flash_detected = 1;
   }
 
   return flash_detected;
@@ -1350,13 +1369,9 @@
   return frame_boost;
 }
 
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
 
   int i;
@@ -1367,7 +1382,7 @@
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   int arf_boost;
-  int flash_detected = FALSE;
+  int flash_detected = 0;
 
   // Search forward from the proposed arf/next gf position
   for (i = 0; i < f_frames; i++) {
@@ -1379,7 +1394,7 @@
                                   &this_frame_mv_in_out, &mv_in_out_accumulator,
                                   &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
 
-    // We want to discount the the flash frame itself and the recovery
+    // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
     flash_detected = detect_flash(cpi, (i + offset)) ||
                      detect_flash(cpi, (i + offset + 1));
@@ -1386,8 +1401,7 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1423,10 +1437,9 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                          ? MIN_DECAY_FACTOR : decay_accumulator;
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1442,80 +1455,144 @@
   return arf_boost;
 }
 
-static void configure_arnr_filter(VP9_COMP *cpi,
-                                  FIRSTPASS_STATS *this_frame,
-                                  int group_boost) {
-  int half_gf_int;
-  int frames_after_arf;
-  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
-  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
-  int q;
+#if CONFIG_MULTIPLE_ARF
+// Work out the frame coding order for a GF or an ARF group.
+// The current implementation codes frames in their natural order for a
+// GF group, and inserts additional ARFs into an ARF group using a
+// binary split approach.
+// NOTE: this function is currently implemented recursively.
+static void schedule_frames(VP9_COMP *cpi, const int start, const int end,
+                            const int arf_idx, const int gf_or_arf_group,
+                            const int level) {
+  int i, abs_end, half_range;
+  int *cfo = cpi->frame_coding_order;
+  int idx = cpi->new_frame_coding_order_period;
 
-  // Define the arnr filter width for this group of frames:
-  // We only filter frames that lie within a distance of half
-  // the GF interval from the ARF frame. We also have to trap
-  // cases where the filter extends beyond the end of clip.
-  // Note: this_frame->frame has been updated in the loop
-  // so it now points at the ARF frame.
-  half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats->count -
-                           this_frame->frame - 1);
+  // If (end < 0) an ARF should be coded at position (-end).
+  assert(start >= 0);
 
-  switch (cpi->oxcf.arnr_type) {
-    case 1: // Backward filter
-      frames_fwd = 0;
-      if (frames_bwd > half_gf_int)
-        frames_bwd = half_gf_int;
-      break;
+  // printf("start:%d end:%d\n", start, end);
 
-    case 2: // Forward filter
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      frames_bwd = 0;
-      break;
+  // GF Group: code frames in logical order.
+  if (gf_or_arf_group == 0) {
+    assert(end >= start);
+    for (i = start; i <= end; ++i) {
+      cfo[idx] = i;
+      cpi->arf_buffer_idx[idx] = arf_idx;
+      cpi->arf_weight[idx] = -1;
+      ++idx;
+    }
+    cpi->new_frame_coding_order_period = idx;
+    return;
+  }
 
-    case 3: // Centered filter
-    default:
-      frames_fwd >>= 1;
-      if (frames_fwd > frames_after_arf)
-        frames_fwd = frames_after_arf;
-      if (frames_fwd > half_gf_int)
-        frames_fwd = half_gf_int;
+  // ARF Group: work out the ARF schedule.
+  // Mark ARF frames as negative.
+  if (end < 0) {
+    // printf("start:%d end:%d\n", -end, -end);
+    // ARF frame is at the end of the range.
+    cfo[idx] = end;
+    // What ARF buffer does this ARF use as predictor.
+    cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2;
+    cpi->arf_weight[idx] = level;
+    ++idx;
+    abs_end = -end;
+  } else {
+    abs_end = end;
+  }
 
-      frames_bwd = frames_fwd;
+  half_range = (abs_end - start) >> 1;
 
-      // For even length filter there is one more frame backward
-      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
-      if (frames_bwd < half_gf_int)
-        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
-      break;
+  // ARFs may not be adjacent, they must be separated by at least
+  // MIN_GF_INTERVAL non-ARF frames.
+  if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) {
+    // printf("start:%d end:%d\n", start, abs_end);
+    // Update the coding order and active ARF.
+    for (i = start; i <= abs_end; ++i) {
+      cfo[idx] = i;
+      cpi->arf_buffer_idx[idx] = arf_idx;
+      cpi->arf_weight[idx] = -1;
+      ++idx;
+    }
+    cpi->new_frame_coding_order_period = idx;
+  } else {
+    // Place a new ARF at the mid-point of the range.
+    cpi->new_frame_coding_order_period = idx;
+    schedule_frames(cpi, start, -(start + half_range), arf_idx + 1,
+                    gf_or_arf_group, level + 1);
+    schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx,
+                    gf_or_arf_group, level + 1);
   }
+}
 
-  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+#define FIXED_ARF_GROUP_SIZE 16
 
-  // Adjust the strength based on active max q
-  q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);
-  if (q > 8) {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+void define_fixed_arf_period(VP9_COMP *cpi) {
+  int i;
+  int max_level = INT_MIN;
+
+  assert(cpi->multi_arf_enabled);
+  assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE);
+
+  // Save the weight of the last frame in the sequence before next
+  // sequence pattern overwrites it.
+  cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
+  assert(cpi->this_frame_weight >= 0);
+
+  // Initialize frame coding order variables.
+  cpi->new_frame_coding_order_period = 0;
+  cpi->next_frame_in_order = 0;
+  cpi->arf_buffered = 0;
+  vp9_zero(cpi->frame_coding_order);
+  vp9_zero(cpi->arf_buffer_idx);
+  vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
+
+  if (cpi->twopass.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
+    // Setup a GF group close to the keyframe.
+    cpi->source_alt_ref_pending = 0;
+    cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
+    schedule_frames(cpi, 0, (cpi->baseline_gf_interval - 1), 2, 0, 0);
   } else {
-    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q);
-    if (cpi->active_arnr_strength < 0)
-      cpi->active_arnr_strength = 0;
+    // Setup a fixed period ARF group.
+    cpi->source_alt_ref_pending = 1;
+    cpi->baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
+    schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
   }
 
-  // Adjust number of frames in filter and strength based on gf boost level.
-  if (cpi->active_arnr_frames > (group_boost / 150)) {
-    cpi->active_arnr_frames = (group_boost / 150);
-    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+  // Replace level indicator of -1 with correct level.
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    if (cpi->arf_weight[i] > max_level) {
+      max_level = cpi->arf_weight[i];
+    }
   }
-  if (cpi->active_arnr_strength > (group_boost / 300)) {
-    cpi->active_arnr_strength = (group_boost / 300);
+  ++max_level;
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    if (cpi->arf_weight[i] == -1) {
+      cpi->arf_weight[i] = max_level;
+    }
   }
+  cpi->max_arf_level = max_level;
+#if 0
+  printf("\nSchedule: ");
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    printf("%4d ", cpi->frame_coding_order[i]);
+  }
+  printf("\n");
+  printf("ARFref:   ");
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    printf("%4d ", cpi->arf_buffer_idx[i]);
+  }
+  printf("\n");
+  printf("Weight:   ");
+  for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+    printf("%4d ", cpi->arf_weight[i]);
+  }
+  printf("\n");
+#endif
 }
+#endif
 
-// Analyse and define a gf/arf group .
+// Analyse and define a gf/arf group.
 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   FIRSTPASS_STATS next_frame;
   FIRSTPASS_STATS *start_pos;
@@ -1619,10 +1696,10 @@
       }
 
       // Break clause to detect very still sections after motion
-      // (for example a staic image after a fade or other transition).
+      // (for example a static image after a fade or other transition).
       if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
                                      last_loop_decay_rate)) {
-        allow_alt_ref = FALSE;
+        allow_alt_ref = 0;
         break;
       }
     }
@@ -1637,9 +1714,9 @@
       // Break at cpi->max_gf_interval unless almost totally static
       (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
       (
-        // Dont break out with a very short interval
+        // Don't break out with a very short interval
         (i > MIN_GF_INTERVAL) &&
-        // Dont break out very close to a key frame
+        // Don't break out very close to a key frame
         ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
         ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&
         (!flash_detected) &&
@@ -1652,12 +1729,12 @@
       break;
     }
 
-    vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+    *this_frame = next_frame;
 
     old_boost_score = boost_score;
   }
 
-  // Dont allow a gf too near the next kf
+  // Don't allow a gf too near the next kf
   if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) {
     while (i < cpi->twopass.frames_to_key) {
       i++;
@@ -1672,10 +1749,22 @@
     }
   }
 
-  // Set the interval till the next gf or arf.
+  // Set the interval until the next gf or arf.
   cpi->baseline_gf_interval = i;
 
-  // Should we use the alternate refernce frame
+#if CONFIG_MULTIPLE_ARF
+  if (cpi->multi_arf_enabled) {
+    // Initialize frame coding order variables.
+    cpi->new_frame_coding_order_period = 0;
+    cpi->next_frame_in_order = 0;
+    cpi->arf_buffered = 0;
+    vp9_zero(cpi->frame_coding_order);
+    vp9_zero(cpi->arf_buffer_idx);
+    vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight));
+  }
+#endif
+
+  // Should we use the alternate reference frame
   if (allow_alt_ref &&
       (i < cpi->oxcf.lag_in_frames) &&
       (i >= MIN_GF_INTERVAL) &&
@@ -1686,16 +1775,66 @@
       ((mv_in_out_accumulator / (double)i > -0.2) ||
        (mv_in_out_accumulator > -2.0)) &&
       (boost_score > 100)) {
-    // Alterrnative boost calculation for alt ref
+    // Alternative boost calculation for alt ref
     cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
-    cpi->source_alt_ref_pending = TRUE;
+    cpi->source_alt_ref_pending = 1;
 
-    configure_arnr_filter(cpi, this_frame, cpi->gfu_boost);
+#if CONFIG_MULTIPLE_ARF
+    // Set the ARF schedule.
+    if (cpi->multi_arf_enabled) {
+      schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
+    }
+#endif
   } else {
     cpi->gfu_boost = (int)boost_score;
-    cpi->source_alt_ref_pending = FALSE;
+    cpi->source_alt_ref_pending = 0;
+#if CONFIG_MULTIPLE_ARF
+    // Set the GF schedule.
+    if (cpi->multi_arf_enabled) {
+      schedule_frames(cpi, 0, cpi->baseline_gf_interval - 1, 2, 0, 0);
+      assert(cpi->new_frame_coding_order_period == cpi->baseline_gf_interval);
+    }
+#endif
   }
 
+#if CONFIG_MULTIPLE_ARF
+  if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) {
+    int max_level = INT_MIN;
+    // Replace level indicator of -1 with correct level.
+    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
+      if (cpi->arf_weight[i] > max_level) {
+        max_level = cpi->arf_weight[i];
+      }
+    }
+    ++max_level;
+    for (i = 0; i < cpi->frame_coding_order_period; ++i) {
+      if (cpi->arf_weight[i] == -1) {
+        cpi->arf_weight[i] = max_level;
+      }
+    }
+    cpi->max_arf_level = max_level;
+  }
+#if 0
+  if (cpi->multi_arf_enabled) {
+    printf("\nSchedule: ");
+    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+      printf("%4d ", cpi->frame_coding_order[i]);
+    }
+    printf("\n");
+    printf("ARFref:   ");
+    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+      printf("%4d ", cpi->arf_buffer_idx[i]);
+    }
+    printf("\n");
+    printf("Weight:   ");
+    for (i = 0; i < cpi->new_frame_coding_order_period; ++i) {
+      printf("%4d ", cpi->arf_weight[i]);
+    }
+    printf("\n");
+  }
+#endif
+#endif
+
   // Now decide how many bits should be allocated to the GF group as  a
   // proportion of those remaining in the kf group.
   // The final key frame group in the clip is treated as a special case
@@ -1702,7 +1841,7 @@
   // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
   // This is also important for short clips where there may only be one
   // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                           cpi->common.current_video_frame)) {
     cpi->twopass.kf_group_bits =
       (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
@@ -1736,29 +1875,26 @@
   cpi->twopass.modified_error_used += gf_group_err;
 
   // Assign  bits to the arf or gf.
-  for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
-    int boost;
+  for (i = 0;
+      i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
+      ++i) {
     int allocation_chunks;
-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
+                                  : cpi->oxcf.fixed_q;
     int gf_bits;
 
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+    int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
+    boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
 
     if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+      allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
-    if (boost > 1028) {
+    if (boost > 1023) {
       int divisor = boost >> 10;
       boost /= divisor;
       allocation_chunks /= divisor;
@@ -1766,41 +1902,34 @@
 
     // Calculate the number of bits to be spent on the gf or arf based on
     // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
+    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
+                                       (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
     if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
+      double alt_gf_grp_bits =
         (double)cpi->twopass.kf_group_bits  *
         (mod_frame_err * (double)cpi->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+      int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
                                            (double)allocation_chunks));
 
-      if (gf_bits > alt_gf_bits) {
+      if (gf_bits > alt_gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
     // Else if it is harder than other frames in the group make sure it at
     // least receives an allocation in keeping with its relative error
     // score, otherwise it may be worse off than an "un-boosted" frame
     else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+                        mod_frame_err /
+                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
 
-      if (alt_gf_bits > gf_bits) {
+      if (alt_gf_bits > gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
 
     // Dont allow a negative value for gf_bits
@@ -1807,18 +1936,21 @@
     if (gf_bits < 0)
       gf_bits = 0;
 
-    gf_bits += cpi->min_frame_bandwidth;                     // Add in minimum for a frame
+    // Add in minimum for a frame
+    gf_bits += cpi->min_frame_bandwidth;
 
     if (i == 0) {
       cpi->twopass.gf_bits = gf_bits;
     }
-    if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))) {
-      cpi->per_frame_bandwidth = gf_bits;                 // Per frame bit target for this frame
+    if (i == 1 || (!cpi->source_alt_ref_pending
+        && (cpi->common.frame_type != KEY_FRAME))) {
+      // Per frame bit target for this frame
+      cpi->per_frame_bandwidth = gf_bits;
     }
   }
 
   {
-    // Adjust KF group bits and error remainin
+    // Adjust KF group bits and error remaining
     cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
     cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
 
@@ -1835,33 +1967,27 @@
     else
       cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
 
-    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+    cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
+        - cpi->min_frame_bandwidth;
 
     if (cpi->twopass.gf_group_bits < 0)
       cpi->twopass.gf_group_bits = 0;
 
     // This condition could fail if there are two kfs very close together
-    // despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the
-    // calculation of cpi->twopass.alt_extra_bits.
+    // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+    // calculation of alt_extra_bits.
     if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
+      const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
 
       if (boost >= 150) {
-        int pct_extra;
-
-        pct_extra = (boost - 100) / 50;
+        int alt_extra_bits;
+        int pct_extra = (boost - 100) / 50;
         pct_extra = (pct_extra > 20) ? 20 : pct_extra;
 
-        cpi->twopass.alt_extra_bits = (int)
-          ((cpi->twopass.gf_group_bits * pct_extra) / 100);
-        cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
-        cpi->twopass.alt_extra_bits /=
-          ((cpi->baseline_gf_interval - 1) >> 1);
-      } else
-        cpi->twopass.alt_extra_bits = 0;
-    } else
-      cpi->twopass.alt_extra_bits = 0;
+        alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
+        cpi->twopass.gf_group_bits -= alt_extra_bits;
+      }
+    }
   }
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -1887,24 +2013,28 @@
 
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
 static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  int    target_frame_size;                                                             // gf_group_error_left
+  int target_frame_size;
 
   double modified_err;
-  double err_fraction;                                                                 // What portion of the remaining GF group error is used by this frame
+  double err_fraction;
 
-  int max_bits = frame_max_bits(cpi);    // Max for a single frame
+  // Max for a single frame.
+  int max_bits = frame_max_bits(cpi);
 
-  // Calculate modified prediction error used in bit allocation
+  // Calculate modified prediction error used in bit allocation.
   modified_err = calculate_modified_err(cpi, this_frame);
 
   if (cpi->twopass.gf_group_error_left > 0)
-    err_fraction = modified_err / cpi->twopass.gf_group_error_left;                              // What portion of the remaining GF group error is used by this frame
+    // What portion of the remaining GF group error is used by this frame.
+    err_fraction = modified_err / cpi->twopass.gf_group_error_left;
   else
     err_fraction = 0.0;
 
-  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);                    // How many of those bits available for allocation should we give it?
+  // How many of those bits available for allocation should we give it?
+  target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
 
-  // Clip to target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at the top end.
+  // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
+  // the top end.
   if (target_frame_size < 0)
     target_frame_size = 0;
   else {
@@ -1915,54 +2045,43 @@
       target_frame_size = (int)cpi->twopass.gf_group_bits;
   }
 
-  // Adjust error remaining
+  // Adjust error and bits remaining.
   cpi->twopass.gf_group_error_left -= (int64_t)modified_err;
-  cpi->twopass.gf_group_bits -= target_frame_size;                                                // Adjust bits remaining
+  cpi->twopass.gf_group_bits -= target_frame_size;
 
   if (cpi->twopass.gf_group_bits < 0)
     cpi->twopass.gf_group_bits = 0;
 
-  target_frame_size += cpi->min_frame_bandwidth;                                          // Add in the minimum number of bits that is set aside for every frame.
+  // Add in the minimum number of bits that is set aside for every frame.
+  target_frame_size += cpi->min_frame_bandwidth;
 
-
-  cpi->per_frame_bandwidth = target_frame_size;                                           // Per frame bit target for this frame
+  // Per frame bit target for this frame.
+  cpi->per_frame_bandwidth = target_frame_size;
 }
 
 // Make a damped adjustment to the active max q.
 static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
   int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
+  const double old_q = vp9_convert_qindex_to_q(old_maxqi);
+  const double new_q = vp9_convert_qindex_to_q(new_maxqi);
+  const double target_q = ((old_q * 7.0) + new_q) / 8.0;
 
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
-
   if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i <= new_maxqi; i++)
+      if (vp9_convert_qindex_to_q(i) >= target_q)
+        return i;
   } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i >= new_maxqi; i--)
+      if (vp9_convert_qindex_to_q(i) <= target_q)
+        return i;
   }
 
-  return ret_val;
+  return new_maxqi;
 }
 
 void vp9_second_pass(VP9_COMP *cpi) {
   int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count -
+  int frames_left = (int)(cpi->twopass.total_stats.count -
                           cpi->common.current_video_frame);
 
   FIRSTPASS_STATS this_frame;
@@ -1971,9 +2090,8 @@
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  if (!cpi->twopass.stats_in) {
+  if (!cpi->twopass.stats_in)
     return;
-  }
 
   vp9_clear_system_state();
 
@@ -1983,13 +2101,9 @@
 
     // Set a cq_level in constrained quality mode.
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
+      int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+                               (int)(cpi->twopass.bits_left / frames_left));
 
-      est_cq =
-        estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left));
-
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality)
         cpi->cq_target_quality = est_cq;
@@ -1999,14 +2113,12 @@
     cpi->twopass.maxq_max_limit = cpi->worst_quality;
     cpi->twopass.maxq_min_limit = cpi->best_quality;
 
-    tmp_q = estimate_max_q(
-              cpi,
-              cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left));
+    tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+                           (int)(cpi->twopass.bits_left / frames_left));
 
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+    cpi->active_worst_quality = tmp_q;
+    cpi->ni_av_qi = tmp_q;
+    cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
 #ifndef ONE_SHOT_Q_ESTIMATE
     // Limit the maxq value returned subsequently.
@@ -2024,15 +2136,15 @@
   // radical adjustments to the allowed quantizer range just to use up a
   // few surplus bits or get beneath the target rate.
   else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
            ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
+            (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1)
       frames_left = 1;
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
@@ -2051,7 +2163,7 @@
   // keyframe and section processing !
   if (cpi->twopass.frames_to_key == 0) {
     // Define next KF group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    this_frame_copy = this_frame;
     find_next_key_frame(cpi, &this_frame_copy);
   }
 
@@ -2058,9 +2170,18 @@
   // Is this a GF / ARF (Note that a KF is always also a GF)
   if (cpi->frames_till_gf_update_due == 0) {
     // Define next gf group and assign bits to it
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-    define_gf_group(cpi, &this_frame_copy);
+    this_frame_copy = this_frame;
 
+#if CONFIG_MULTIPLE_ARF
+    if (cpi->multi_arf_enabled) {
+      define_fixed_arf_period(cpi);
+    } else {
+#endif
+      define_gf_group(cpi, &this_frame_copy);
+#if CONFIG_MULTIPLE_ARF
+    }
+#endif
+
     // If we are going to code an altref frame at the end of the group
     // and the current frame is not a key frame....
     // If the previous group used an arf this frame has already benefited
@@ -2071,7 +2192,7 @@
       // Assign a standard frames worth of bits from those allocated
       // to the GF group
       int bak = cpi->per_frame_bandwidth;
-      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+      this_frame_copy = this_frame;
       assign_std_frame_bits(cpi, &this_frame_copy);
       cpi->per_frame_bandwidth = bak;
     }
@@ -2078,7 +2199,7 @@
   } else {
     // Otherwise this is an ordinary frame
     // Assign bits from those allocated to the GF group
-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+    this_frame_copy =  this_frame;
     assign_std_frame_bits(cpi, &this_frame_copy);
   }
 
@@ -2101,8 +2222,8 @@
 
   cpi->twopass.frames_to_key--;
 
-  // Update the total stats remaining sturcture
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+  // Update the total stats remaining structure
+  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
@@ -2109,7 +2230,7 @@
                              FIRSTPASS_STATS *last_frame,
                              FIRSTPASS_STATS *this_frame,
                              FIRSTPASS_STATS *next_frame) {
-  int is_viable_kf = FALSE;
+  int is_viable_kf = 0;
 
   // Does the frame satisfy the primary criteria of a key frame
   //      If so, then examine how well it predicts subsequent frames
@@ -2136,7 +2257,7 @@
     double decay_accumulator = 1.0;
     double next_iiratio;
 
-    vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+    local_next_frame = *next_frame;
 
     // Note the starting file position so we can reset to it
     start_pos = cpi->twopass.stats_in;
@@ -2178,14 +2299,15 @@
         break;
     }
 
-    // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out else discard this potential key frame and move on
     if (boost_score > 30.0 && (i > 3))
-      is_viable_kf = TRUE;
+      is_viable_kf = 1;
     else {
       // Reset the file position
       reset_fpf_position(cpi, start_pos);
 
-      is_viable_kf = FALSE;
+      is_viable_kf = 0;
     }
   }
 
@@ -2201,7 +2323,6 @@
   double decay_accumulator = 1.0;
   double zero_motion_accumulator = 1.0;
   double boost_score = 0;
-  double old_boost_score = 0.0;
   double loop_decay_rate;
 
   double kf_mod_err = 0.0;
@@ -2221,7 +2342,7 @@
   cpi->this_key_frame_forced = cpi->next_key_frame_forced;
 
   // Clear the alt ref active flag as this can never be active on a key frame
-  cpi->source_alt_ref_active = FALSE;
+  cpi->source_alt_ref_active = 0;
 
   // Kf is always a gf so clear frames till next gf counter
   cpi->frames_till_gf_update_due = 0;
@@ -2229,9 +2350,9 @@
   cpi->twopass.frames_to_key = 1;
 
   // Take a copy of the initial frame details
-  vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+  first_frame = *this_frame;
 
-  cpi->twopass.kf_group_bits = 0;        // Total bits avaialable to kf group
+  cpi->twopass.kf_group_bits = 0;        // Total bits available to kf group
   cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
 
   kf_mod_err = calculate_modified_err(cpi, this_frame);
@@ -2248,7 +2369,7 @@
     kf_group_coded_err += this_frame->coded_error;
 
     // load a the next frame's stats
-    vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+    last_frame = *this_frame;
     input_stats(cpi, this_frame);
 
     // Provided that we are not at the end of the file...
@@ -2255,10 +2376,10 @@
     if (cpi->oxcf.auto_key
         && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
       // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
-      }
 
+
       // How fast is prediction quality decaying
       loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
@@ -2267,20 +2388,15 @@
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
+      for (j = 0; j < 8; j++)
+        decay_accumulator *= recent_loop_decay[j];
 
       // Special check for transition or high motion followed by a
       // to a static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
+      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+                                     loop_decay_rate, decay_accumulator))
         break;
-      }
 
-
       // Step on to the next frame
       cpi->twopass.frames_to_key++;
 
@@ -2306,7 +2422,7 @@
     cpi->twopass.frames_to_key /= 2;
 
     // Copy first frame details
-    vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+    tmp_frame = first_frame;
 
     // Reset to the start of the group
     reset_fpf_position(cpi, start_position);
@@ -2329,9 +2445,9 @@
     // Reset to the start of the group
     reset_fpf_position(cpi, current_pos);
 
-    cpi->next_key_frame_forced = TRUE;
+    cpi->next_key_frame_forced = 1;
   } else
-    cpi->next_key_frame_forced = FALSE;
+    cpi->next_key_frame_forced = 0;
 
   // Special case for the last frame of the file
   if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
@@ -2373,6 +2489,7 @@
   boost_score = 0.0;
   loop_decay_rate = 1.00;       // Starting decay rate
 
+  // Scan through the kf group collating various stats.
   for (i = 0; i < cpi->twopass.frames_to_key; i++) {
     double r;
 
@@ -2379,16 +2496,6 @@
     if (EOF == input_stats(cpi, &next_frame))
       break;
 
-    if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
-      r = (IIKFACTOR2 * next_frame.intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-    else
-      r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
-           DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
-
-    if (r > RMAX)
-      r = RMAX;
-
     // Monitor for static sections.
     if ((next_frame.pcnt_inter - next_frame.pcnt_motion) <
         zero_motion_accumulator) {
@@ -2396,22 +2503,28 @@
         (next_frame.pcnt_inter - next_frame.pcnt_motion);
     }
 
-    // How fast is prediction quality decaying
-    if (!detect_flash(cpi, 0)) {
-      loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-      decay_accumulator = decay_accumulator * loop_decay_rate;
-      decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                            ? MIN_DECAY_FACTOR : decay_accumulator;
-    }
+    // For the first few frames collect data to decide kf boost.
+    if (i <= (cpi->max_gf_interval * 2)) {
+      if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+        r = (IIKFACTOR2 * next_frame.intra_error /
+             DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+      else
+        r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+             DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
 
-    boost_score += (decay_accumulator * r);
+      if (r > RMAX)
+        r = RMAX;
 
-    if ((i > MIN_GF_INTERVAL) &&
-        ((boost_score - old_boost_score) < 6.25)) {
-      break;
-    }
+      // How fast is prediction quality decaying
+      if (!detect_flash(cpi, 0)) {
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+        decay_accumulator = decay_accumulator * loop_decay_rate;
+        decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
+      }
 
-    old_boost_score = boost_score;
+      boost_score += (decay_accumulator * r);
+    }
   }
 
   {
@@ -2441,8 +2554,8 @@
     int allocation_chunks;
     int alt_kf_bits;
 
-    if (kf_boost < (cpi->twopass.frames_to_key * 5))
-      kf_boost = (cpi->twopass.frames_to_key * 5);
+    if (kf_boost < (cpi->twopass.frames_to_key * 3))
+      kf_boost = (cpi->twopass.frames_to_key * 3);
 
     if (kf_boost < 300) // Min KF boost
       kf_boost = 300;
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -46,7 +46,7 @@
       unsigned int i;
 
       for (i = 0; i < ctx->max_sz; i++)
-        vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+        vp9_free_frame_buffer(&ctx->buf[i].img);
       free(ctx->buf);
     }
     free(ctx);
@@ -56,6 +56,8 @@
 
 struct lookahead_ctx * vp9_lookahead_init(unsigned int width,
                                           unsigned int height,
+                                          unsigned int subsampling_x,
+                                          unsigned int subsampling_y,
                                           unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
@@ -71,8 +73,9 @@
     if (!ctx->buf)
       goto bail;
     for (i = 0; i < depth; i++)
-      if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
-                                      width, height, VP9BORDERINPIXELS))
+      if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
+                                 width, height, subsampling_x, subsampling_y,
+                                 VP9BORDERINPIXELS))
         goto bail;
   }
   return ctx;
@@ -81,14 +84,17 @@
   return NULL;
 }
 
+#define USE_PARTIAL_COPY 0
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
                        int64_t ts_start, int64_t ts_end, unsigned int flags,
                        unsigned char *active_map) {
   struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
   int row, col, active_end;
   int mb_rows = (src->y_height + 15) >> 4;
   int mb_cols = (src->y_width + 15) >> 4;
+#endif
 
   if (ctx->sz + 1 > ctx->max_sz)
     return 1;
@@ -95,6 +101,10 @@
   ctx->sz++;
   buf = pop(ctx, &ctx->write_idx);
 
+#if USE_PARTIAL_COPY
+  // TODO(jkoleszar): This is disabled for now, as
+  // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
   // Only do this partial copy if the following conditions are all met:
   // 1. Lookahead queue has has size of 1.
   // 2. Active map is provided.
@@ -137,6 +147,11 @@
   } else {
     vp9_copy_and_extend_frame(src, &buf->img);
   }
+#else
+  // Partial copy not implemented yet
+  vp9_copy_and_extend_frame(src, &buf->img);
+#endif
+
   buf->ts_start = ts_start;
   buf->ts_end = ts_end;
   buf->flags = flags;
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -31,6 +31,8 @@
  */
 struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
                                          unsigned int height,
+                                         unsigned int subsampling_x,
+                                         unsigned int subsampling_y,
                                          unsigned int depth);
 
 
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -9,13 +9,13 @@
  */
 
 #include <limits.h>
+
+#include <vpx_mem/vpx_mem.h>
 #include <vp9/encoder/vp9_encodeintra.h>
 #include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_setupintrarecon.h>
 #include <vp9/common/vp9_blockd.h>
 #include <vp9/common/vp9_reconinter.h>
 #include <vp9/common/vp9_systemdependent.h>
-#include <vpx_mem/vpx_mem.h>
 #include <vp9/encoder/vp9_segmentation.h>
 
 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
@@ -25,21 +25,18 @@
                                               int mb_col) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  BLOCK *b  = &x->block[0];
-  BLOCKD *d = &xd->block[0];
   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
   unsigned int best_err;
 
-
-  int tmp_col_min = x->mv_col_min;
-  int tmp_col_max = x->mv_col_max;
-  int tmp_row_min = x->mv_row_min;
-  int tmp_row_max = x->mv_row_max;
+  const int tmp_col_min = x->mv_col_min;
+  const int tmp_col_max = x->mv_col_max;
+  const int tmp_row_min = x->mv_row_min;
+  const int tmp_row_max = x->mv_row_max;
   int_mv ref_full;
 
   // Further step/diamond searches as necessary
   int step_param = cpi->sf.first_step +
-      (cpi->Speed < 8 ? (cpi->Speed > 5 ? 1 : 0) : 2);
+      (cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
 
   vp9_clamp_mv_min_max(x, ref_mv);
 
@@ -47,15 +44,8 @@
   ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
 
   /*cpi->sf.search_method == HEX*/
-  best_err = vp9_hex_search(
-      x, b, d,
-      &ref_full, dst_mv,
-      step_param,
-      x->errorperbit,
-      &v_fn_ptr,
-      NULL, NULL,
-      NULL, NULL,
-      ref_mv);
+  best_err = vp9_hex_search(x, &ref_full, dst_mv, step_param, x->errorperbit,
+                            &v_fn_ptr, NULL, NULL, NULL, NULL, ref_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -63,7 +53,7 @@
     int distortion;
     unsigned int sse;
     best_err = cpi->find_fractional_mv_step(
-        x, b, d,
+        x,
         dst_mv, ref_mv,
         x->errorperbit, &v_fn_ptr,
         NULL, NULL,
@@ -71,9 +61,10 @@
   }
 
   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
-  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);
-  best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
-                          xd->predictor, 16, INT_MAX);
+  vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);
+  best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                          xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                          INT_MAX);
 
   /* restore UMV window */
   x->mv_col_min = tmp_col_min;
@@ -84,42 +75,20 @@
   return best_err;
 }
 
-static int do_16x16_motion_search
-(
-  VP9_COMP *cpi,
-  int_mv *ref_mv,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset,
-  int mb_row,
-  int mb_col) {
-  MACROBLOCK   *const x  = &cpi->mb;
+static int do_16x16_motion_search(VP9_COMP *cpi,
+                                  int_mv *ref_mv, int_mv *dst_mv,
+                                  int buf_mb_y_offset, int mb_y_offset,
+                                  int mb_row, int mb_col) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
   int_mv tmp_mv;
-  int n;
 
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
   dst_mv->as_int = 0;
 
   // Test last reference frame using the previous best mv as the
@@ -126,7 +95,7 @@
   // starting point (best reference) for the search
   tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
   if (tmp_err < err) {
-    err            = tmp_err;
+    err = tmp_err;
     dst_mv->as_int = tmp_mv.as_int;
   }
 
@@ -147,51 +116,26 @@
   return err;
 }
 
-static int do_16x16_zerozero_search
-(
-  VP9_COMP *cpi,
-  int_mv *dst_mv,
-  YV12_BUFFER_CONFIG *buf,
-  int buf_mb_y_offset,
-  YV12_BUFFER_CONFIG *ref,
-  int mb_y_offset
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
+static int do_16x16_zerozero_search(VP9_COMP *cpi,
+                                    int_mv *dst_mv,
+                                    int buf_mb_y_offset, int mb_y_offset) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err;
-  int n;
 
-  for (n = 0; n < 16; n++) {
-    BLOCKD *d = &xd->block[n];
-    BLOCK *b  = &x->block[n];
-
-    b->base_src   = &buf->y_buffer;
-    b->src_stride = buf->y_stride;
-    b->src        = buf->y_stride * (n & 12) + (n & 3) * 4 + buf_mb_y_offset;
-
-    d->base_pre   = &ref->y_buffer;
-    d->pre_stride = ref->y_stride;
-    d->pre        = ref->y_stride * (n & 12) + (n & 3) * 4 + mb_y_offset;
-  }
-
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  xd->pre.y_buffer = ref->y_buffer + mb_y_offset;
-  xd->pre.y_stride = ref->y_stride;
-  err = vp9_sad16x16(ref->y_buffer + mb_y_offset, ref->y_stride,
-                     xd->dst.y_buffer, xd->dst.y_stride, INT_MAX);
+  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                     xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
+                     INT_MAX);
 
   dst_mv->as_int = 0;
 
   return err;
 }
-static int find_best_16x16_intra
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *buf,
-  int mb_y_offset,
-  MB_PREDICTION_MODE *pbest_mode
-) {
+static int find_best_16x16_intra(VP9_COMP *cpi,
+                                 int mb_y_offset,
+                                 MB_PREDICTION_MODE *pbest_mode) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_PREDICTION_MODE best_mode = -1, mode;
@@ -201,11 +145,19 @@
   // we're intentionally not doing 4x4, we just want a rough estimate
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     unsigned int err;
+    const int bwl = b_width_log2(BLOCK_SIZE_MB16X16),  bw = 4 << bwl;
+    const int bhl = b_height_log2(BLOCK_SIZE_MB16X16), bh = 4 << bhl;
 
     xd->mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_mby(xd);
-    err = vp9_sad16x16(xd->predictor, 16, buf->y_buffer + mb_y_offset,
-                       buf->y_stride, best_err);
+    vp9_build_intra_predictors(x->plane[0].src.buf, x->plane[0].src.stride,
+                               xd->plane[0].dst.buf, xd->plane[0].dst.stride,
+                               xd->mode_info_context->mbmi.mode,
+                               bw, bh,
+                               xd->up_available, xd->left_available,
+                               xd->right_available);
+    err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                       xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err);
+
     // find best
     if (err < best_err) {
       best_err  = err;
@@ -234,15 +186,21 @@
   int mb_row,
   int mb_col
 ) {
-  MACROBLOCK   *const x  = &cpi->mb;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int intra_error;
+  VP9_COMMON *cm = &cpi->common;
 
   // FIXME in practice we're completely ignoring chroma here
-  xd->dst.y_buffer = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+  x->plane[0].src.stride = buf->y_stride;
 
+  xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset;
+  xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride;
+
   // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, buf, mb_y_offset, &stats->ref[INTRA_FRAME].m.mode);
+  intra_error = find_best_16x16_intra(cpi, mb_y_offset,
+                                      &stats->ref[INTRA_FRAME].m.mode);
   if (intra_error <= 0)
     intra_error = 1;
   stats->ref[INTRA_FRAME].err = intra_error;
@@ -249,11 +207,14 @@
 
   // Golden frame MV search, if it exists and is different than last frame
   if (golden_ref) {
-    int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,
-                                                &stats->ref[GOLDEN_FRAME].m.mv,
-                                                buf, mb_y_offset,
-                                                golden_ref, gld_y_offset,
-                                                mb_row, mb_col);
+    int g_motion_error;
+    xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = golden_ref->y_stride;
+    g_motion_error = do_16x16_motion_search(cpi,
+                                            prev_golden_ref_mv,
+                                            &stats->ref[GOLDEN_FRAME].m.mv,
+                                            mb_y_offset, gld_y_offset,
+                                            mb_row, mb_col);
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
     stats->ref[GOLDEN_FRAME].err = INT_MAX;
@@ -262,17 +223,13 @@
 
   // Alt-ref frame MV search, if it exists and is different than last/golden frame
   if (alt_ref) {
-    // int a_motion_error = do_16x16_motion_search(cpi, prev_alt_ref_mv,
-    //                                            &stats->ref[ALTREF_FRAME].m.mv,
-    //                                            buf, mb_y_offset,
-    //                                            alt_ref, arf_y_offset);
+    int a_motion_error;
+    xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+    xd->plane[0].pre[0].stride = alt_ref->y_stride;
+    a_motion_error = do_16x16_zerozero_search(cpi,
+                                              &stats->ref[ALTREF_FRAME].m.mv,
+                                              mb_y_offset, arf_y_offset);
 
-    int a_motion_error =
-      do_16x16_zerozero_search(cpi,
-                               &stats->ref[ALTREF_FRAME].m.mv,
-                               buf, mb_y_offset,
-                               alt_ref, arf_y_offset);
-
     stats->ref[ALTREF_FRAME].err = a_motion_error;
   } else {
     stats->ref[ALTREF_FRAME].err = INT_MAX;
@@ -280,17 +237,15 @@
   }
 }
 
-static void update_mbgraph_frame_stats
-(
-  VP9_COMP *cpi,
-  MBGRAPH_FRAME_STATS *stats,
-  YV12_BUFFER_CONFIG *buf,
-  YV12_BUFFER_CONFIG *golden_ref,
-  YV12_BUFFER_CONFIG *alt_ref
-) {
-  MACROBLOCK   *const x  = &cpi->mb;
-  VP9_COMMON   *const cm = &cpi->common;
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+
   int mb_col, mb_row, offset = 0;
   int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
   int_mv arf_top_mv, gld_top_mv;
@@ -302,14 +257,17 @@
   // Set up limit values for motion vectors to prevent them extending outside the UMV borders
   arf_top_mv.as_int = 0;
   gld_top_mv.as_int = 0;
-  x->mv_row_min     = -(VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND);
-  x->mv_row_max     = (cm->mb_rows - 1) * 16 + VP9BORDERINPIXELS
-                      - 16 - VP9_INTERP_EXTEND;
+  x->mv_row_min     = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
+  x->mv_row_max     = (cm->mb_rows - 1) * 8 + VP9BORDERINPIXELS
+                      - 8 - VP9_INTERP_EXTEND;
   xd->up_available  = 0;
-  xd->dst.y_stride  = buf->y_stride;
-  xd->pre.y_stride  = buf->y_stride;
-  xd->dst.uv_stride = buf->uv_stride;
+  xd->plane[0].dst.stride  = buf->y_stride;
+  xd->plane[0].pre[0].stride  = buf->y_stride;
+  xd->plane[1].dst.stride = buf->uv_stride;
   xd->mode_info_context = &mi_local;
+  mi_local.mbmi.sb_type = BLOCK_SIZE_MB16X16;
+  mi_local.mbmi.ref_frame[0] = LAST_FRAME;
+  mi_local.mbmi.ref_frame[1] = NONE;
 
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     int_mv arf_left_mv, gld_left_mv;
@@ -320,9 +278,9 @@
     // Set up limit values for motion vectors to prevent them extending outside the UMV borders
     arf_left_mv.as_int = arf_top_mv.as_int;
     gld_left_mv.as_int = gld_top_mv.as_int;
-    x->mv_col_min      = -(VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND);
-    x->mv_col_max      = (cm->mb_cols - 1) * 16 + VP9BORDERINPIXELS
-                         - 16 - VP9_INTERP_EXTEND;
+    x->mv_col_min      = -(VP9BORDERINPIXELS - 8 - VP9_INTERP_EXTEND);
+    x->mv_col_max      = (cm->mb_cols - 1) * 8 + VP9BORDERINPIXELS
+                         - 8 - VP9_INTERP_EXTEND;
     xd->left_available = 0;
 
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
@@ -379,8 +337,7 @@
     for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
          offset += cm->mb_cols, mb_row++) {
       for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-        MBGRAPH_MB_STATS *mb_stats =
-          &frame_stats->mb_stats[offset + mb_col];
+        MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
 
         int altref_err = mb_stats->ref[ALTREF_FRAME].err;
         int intra_err  = mb_stats->ref[INTRA_FRAME ].err;
@@ -387,9 +344,9 @@
         int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
 
         // Test for altref vs intra and gf and that its mv was 0,0.
-        if ((altref_err > 1000) ||
-            (altref_err > intra_err) ||
-            (altref_err > golden_err)) {
+        if (altref_err > 1000 ||
+            altref_err > intra_err ||
+            altref_err > golden_err) {
           arf_not_zz[offset + mb_col]++;
         }
       }
@@ -404,10 +361,16 @@
       // goes in segment 0
       if (arf_not_zz[offset + mb_col]) {
         ncnt[0]++;
-        cpi->segmentation_map[offset + mb_col] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 0;
       } else {
+        cpi->segmentation_map[offset * 4 + 2 * mb_col] = 1;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
+        cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
         ncnt[1]++;
-        cpi->segmentation_map[offset + mb_col] = 1;
       }
     }
   }
@@ -425,10 +388,10 @@
       cpi->static_mb_pct = 0;
 
     cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR) cpi);
+    vp9_enable_segmentation((VP9_PTR)cpi);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR) cpi);
+    vp9_disable_segmentation((VP9_PTR)cpi);
   }
 
   // Free localy allocated storage
@@ -463,8 +426,7 @@
   // the ARF MC search backwards, to get optimal results for MV caching
   for (i = 0; i < n_frames; i++) {
     MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
-    struct lookahead_entry *q_cur =
-      vp9_lookahead_peek(cpi->lookahead, i);
+    struct lookahead_entry *q_cur = vp9_lookahead_peek(cpi->lookahead, i);
 
     assert(q_cur != NULL);
 
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -56,8 +56,9 @@
   MV v;
   v.row = mv->as_mv.row - ref->as_mv.row;
   v.col = mv->as_mv.col - ref->as_mv.col;
-  return ((mvjcost[vp9_get_mv_joint(v)] +
-           mvcost[0][v.row] + mvcost[1][v.col]) * weight) >> 7;
+  return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
+                             mvcost[0][v.row] +
+                             mvcost[1][v.col]) * weight, 7);
 }
 
 static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],
@@ -66,9 +67,9 @@
     MV v;
     v.row = mv->as_mv.row - ref->as_mv.row;
     v.col = mv->as_mv.col - ref->as_mv.col;
-    return ((mvjcost[vp9_get_mv_joint(v)] +
-             mvcost[0][v.row] + mvcost[1][v.col]) *
-            error_per_bit + 4096) >> 13;
+    return ROUND_POWER_OF_TWO((mvjcost[vp9_get_mv_joint(&v)] +
+                               mvcost[0][v.row] +
+                               mvcost[1][v.col]) * error_per_bit, 13);
   }
   return 0;
 }
@@ -79,9 +80,9 @@
     MV v;
     v.row = mv->as_mv.row - ref->as_mv.row;
     v.col = mv->as_mv.col - ref->as_mv.col;
-    return ((mvjsadcost[vp9_get_mv_joint(v)] +
-             mvsadcost[0][v.row] + mvsadcost[1][v.col]) *
-            error_per_bit + 128) >> 8;
+    return ROUND_POWER_OF_TWO((mvjsadcost[vp9_get_mv_joint(&v)] +
+                               mvsadcost[0][v.row] +
+                               mvsadcost[1][v.col]) * error_per_bit, 8);
   }
   return 0;
 }
@@ -222,7 +223,7 @@
 
 /* returns subpixel variance error function */
 #define DIST(r, c) \
-    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, b->src_stride, &sse)
+    vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, src_stride, &sse)
 
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
@@ -238,7 +239,7 @@
     },                                                                   \
     v = INT_MAX;)
 
-int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
                                              int_mv *bestmv, int_mv *ref_mv,
                                              int error_per_bit,
                                              const vp9_variance_fn_ptr_t *vfp,
@@ -245,7 +246,8 @@
                                              int *mvjcost, int *mvcost[2],
                                              int *distortion,
                                              unsigned int *sse1) {
-  uint8_t *z = (*(b->base_src) + b->src);
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
   MACROBLOCKD *xd = &x->e_mbd;
 
   int rr, rc, br, bc, hstep;
@@ -263,10 +265,12 @@
   int offset;
   int usehp = xd->allow_high_precision_mv;
 
-  uint8_t *y = *(d->base_pre) + d->pre +
-               (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
 
+  y_stride = xd->plane[0].pre[0].stride;
+
   rr = ref_mv->as_mv.row;
   rc = ref_mv->as_mv.col;
   br = bestmv->as_mv.row << 3;
@@ -288,7 +292,7 @@
   bestmv->as_mv.col <<= 3;
 
   // calculate central point error
-  besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
   *distortion = besterr;
   besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
                          error_per_bit, xd->allow_high_precision_mv);
@@ -409,6 +413,200 @@
 
   return besterr;
 }
+
+#undef DIST
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+              z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred, int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): central pointer error was already calculated in full-
+  // pixel search, and can be passed in this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // Each subsequent iteration checks at least one point in
+  // common with the last iteration could be 2 ( if diag selected)
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration could be 2 ( if diag selected) 1/4 pel
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  vpx_free(comp_pred);
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+
 #undef MVC
 #undef PRE
 #undef DIST
@@ -417,7 +615,7 @@
 #undef MIN
 #undef MAX
 
-int vp9_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_find_best_sub_pixel_step(MACROBLOCK *x,
                                  int_mv *bestmv, int_mv *ref_mv,
                                  int error_per_bit,
                                  const vp9_variance_fn_ptr_t *vfp,
@@ -428,7 +626,8 @@
   int_mv this_mv;
   int_mv orig_mv;
   int yrow_movedback = 0, ycol_movedback = 0;
-  uint8_t *z = (*(b->base_src) + b->src);
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
   int left, right, up, down, diag;
   unsigned int sse;
   int whichdir;
@@ -437,9 +636,10 @@
   MACROBLOCKD *xd = &x->e_mbd;
   int usehp = xd->allow_high_precision_mv;
 
-  uint8_t *y = *(d->base_pre) + d->pre +
-               (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+  y_stride = xd->plane[0].pre[0].stride;
 
   // central mv
   bestmv->as_mv.row <<= 3;
@@ -448,7 +648,7 @@
   orig_mv = *bestmv;
 
   // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  bestmse = vfp->vf(y, y_stride, z, src_stride, sse1);
   *distortion = bestmse;
   bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit,
                          xd->allow_high_precision_mv);
@@ -456,7 +656,7 @@
   // go left then right and check error
   this_mv.as_mv.row = startmv.as_mv.row;
   this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse);
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -468,7 +668,7 @@
   }
 
   this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -482,7 +682,7 @@
   // go up then down and check error
   this_mv.as_mv.col = startmv.as_mv.col;
   this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse);
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                              xd->allow_high_precision_mv);
 
@@ -494,7 +694,7 @@
   }
 
   this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -516,23 +716,25 @@
     case 0:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, src_stride,
+                                    &sse);
       break;
     case 1:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, src_stride,
+                                    &sse);
       break;
     case 2:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse);
       break;
     case 3:
     default:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse);
       break;
   }
 
@@ -571,11 +773,11 @@
     this_mv.as_mv.col = startmv.as_mv.col - 2;
     thismse = vfp->svf(y, y_stride,
                        SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
     thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                       b->src_stride, &sse);
+                       src_stride, &sse);
   }
 
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -591,7 +793,7 @@
   this_mv.as_mv.col += 4;
   thismse = vfp->svf(y, y_stride,
                      SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
+                     z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -609,11 +811,11 @@
     this_mv.as_mv.row = startmv.as_mv.row - 2;
     thismse = vfp->svf(y, y_stride,
                        SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
     thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   }
 
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -628,7 +830,7 @@
 
   this_mv.as_mv.row += 4;
   thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
+                     z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -655,10 +857,13 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+          thismse = vfp->svf(y, y_stride,
+                             SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                             z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, b->src_stride, &sse);;
+          thismse = vfp->svf(y - 1, y_stride,
+                             SP(6), SP(this_mv.as_mv.row), z, src_stride, &sse);
         }
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
@@ -665,10 +870,12 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 2;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride, y_stride,
+                             SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(6), SP(6), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride - 1, y_stride,
+                             SP(6), SP(6), z, src_stride, &sse);
         }
       }
 
@@ -678,10 +885,13 @@
 
       if (startmv.as_mv.row & 7) {
         this_mv.as_mv.row -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), z, b->src_stride, &sse);
+        thismse = vfp->svf(y - y_stride, y_stride,
+                           SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse);
       }
 
       break;
@@ -690,12 +900,13 @@
 
       if (startmv.as_mv.col & 7) {
         this_mv.as_mv.col -= 2;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                           z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
         thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z,
-                           b->src_stride, &sse);
+                           src_stride, &sse);
       }
 
       break;
@@ -704,7 +915,7 @@
       this_mv.as_mv.row += 2;
       thismse = vfp->svf(y, y_stride,
                          SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                         z, b->src_stride, &sse);
+                         z, src_stride, &sse);
       break;
   }
 
@@ -746,11 +957,11 @@
     this_mv.as_mv.col = startmv.as_mv.col - 1;
     thismse = vfp->svf(y, y_stride,
                        SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
     thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row),
-                       z, b->src_stride, &sse);
+                       z, src_stride, &sse);
   }
 
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -765,7 +976,7 @@
 
   this_mv.as_mv.col += 2;
   thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
-                     z, b->src_stride, &sse);
+                     z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -781,10 +992,13 @@
 
   if (startmv.as_mv.row & 7) {
     this_mv.as_mv.row = startmv.as_mv.row - 1;
-    thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+    thismse = vfp->svf(y, y_stride,
+                       SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                       z, src_stride, &sse);
   } else {
     this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-    thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+    thismse = vfp->svf(y - y_stride, y_stride,
+                       SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
   }
 
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
@@ -798,7 +1012,9 @@
   }
 
   this_mv.as_mv.row += 2;
-  thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+  thismse = vfp->svf(y, y_stride,
+                     SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                     z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -824,10 +1040,14 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+          thismse = vfp->svf(y, y_stride,
+                             SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                             z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);;
+          thismse = vfp->svf(y - 1, y_stride,
+                             SP(7), SP(this_mv.as_mv.row),
+                             z, src_stride, &sse);
         }
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
@@ -834,10 +1054,12 @@
 
         if (startmv.as_mv.col & 7) {
           this_mv.as_mv.col -= 1;
-          thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride, y_stride,
+                             SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
         } else {
           this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-          thismse = vfp->svf(y - y_stride - 1, y_stride, SP(7), SP(7), z, b->src_stride, &sse);
+          thismse = vfp->svf(y - y_stride - 1, y_stride,
+                             SP(7), SP(7), z, src_stride, &sse);
         }
       }
 
@@ -847,10 +1069,13 @@
 
       if (startmv.as_mv.row & 7) {
         this_mv.as_mv.row -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7;
-        thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(7), z, b->src_stride, &sse);
+        thismse = vfp->svf(y - y_stride, y_stride,
+                           SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse);
       }
 
       break;
@@ -859,10 +1084,13 @@
 
       if (startmv.as_mv.col & 7) {
         this_mv.as_mv.col -= 1;
-        thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y, y_stride,
+                           SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                           z, src_stride, &sse);
       } else {
         this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7;
-        thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+        thismse = vfp->svf(y - 1, y_stride,
+                           SP(7), SP(this_mv.as_mv.row), z, src_stride, &sse);
       }
 
       break;
@@ -869,7 +1097,9 @@
     case 3:
       this_mv.as_mv.col += 1;
       this_mv.as_mv.row += 1;
-      thismse = vfp->svf(y, y_stride,  SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, b->src_stride, &sse);
+      thismse = vfp->svf(y, y_stride,
+                         SP(this_mv.as_mv.col), SP(this_mv.as_mv.row),
+                         z, src_stride, &sse);
       break;
   }
 
@@ -888,7 +1118,7 @@
 
 #undef SP
 
-int vp9_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_find_best_half_pixel_step(MACROBLOCK *x,
                                   int_mv *bestmv, int_mv *ref_mv,
                                   int error_per_bit,
                                   const vp9_variance_fn_ptr_t *vfp,
@@ -898,7 +1128,8 @@
   int bestmse = INT_MAX;
   int_mv startmv;
   int_mv this_mv;
-  uint8_t *z = (*(b->base_src) + b->src);
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
   int left, right, up, down, diag;
   unsigned int sse;
   int whichdir;
@@ -906,9 +1137,9 @@
   int y_stride;
   MACROBLOCKD *xd = &x->e_mbd;
 
-  uint8_t *y = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  y_stride = d->pre_stride;
+  uint8_t *y = xd->plane[0].pre[0].buf +
+      (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col;
+  y_stride = xd->plane[0].pre[0].stride;
 
   // central mv
   bestmv->as_mv.row <<= 3;
@@ -916,7 +1147,7 @@
   startmv = *bestmv;
 
   // calculate central point error
-  bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+  bestmse = vfp->vf(y, y_stride, z, src_stride, sse1);
   *distortion = bestmse;
   bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit,
                          xd->allow_high_precision_mv);
@@ -924,7 +1155,7 @@
   // go left then right and check error
   this_mv.as_mv.row = startmv.as_mv.row;
   this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
-  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse);
   left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -936,7 +1167,7 @@
   }
 
   this_mv.as_mv.col += 8;
-  thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse);
   right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
                                 error_per_bit, xd->allow_high_precision_mv);
 
@@ -950,7 +1181,7 @@
   // go up then down and check error
   this_mv.as_mv.col = startmv.as_mv.col;
   this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
-  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse);
   up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                              xd->allow_high_precision_mv);
 
@@ -962,7 +1193,7 @@
   }
 
   this_mv.as_mv.row += 8;
-  thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+  thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse);
   down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit,
                                xd->allow_high_precision_mv);
 
@@ -981,23 +1212,25 @@
     case 0:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride,
+                                    z, src_stride, &sse);
       break;
     case 1:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
-      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride,
+                                    z, src_stride, &sse);
       break;
     case 2:
       this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse);
       break;
     case 3:
     default:
       this_mv.as_mv.col += 4;
       this_mv.as_mv.row += 4;
-      thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+      thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse);
       break;
   }
 
@@ -1057,8 +1290,6 @@
 int vp9_hex_search
 (
   MACROBLOCK *x,
-  BLOCK *b,
-  BLOCKD *d,
   int_mv *ref_mv,
   int_mv *best_mv,
   int search_param,
@@ -1068,13 +1299,14 @@
   int *mvjcost, int *mvcost[2],
   int_mv *center_mv
 ) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
   MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
   MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
   int i, j;
 
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
   int br, bc;
   int_mv this_mv;
   unsigned int bestsad = 0x7fffffff;
@@ -1095,8 +1327,8 @@
   bc = ref_mv->as_mv.col;
 
   // Work out the start point for the search
-  base_offset = (uint8_t *)(*(d->base_pre) + d->pre);
-  this_offset = base_offset + (br * (d->pre_stride)) + bc;
+  base_offset = (uint8_t *)(xd->plane[0].pre[0].buf);
+  this_offset = base_offset + (br * (xd->plane[0].pre[0].stride)) + bc;
   this_mv.as_mv.row = br;
   this_mv.as_mv.col = bc;
   bestsad = vfp->sdf(what, what_stride, this_offset,
@@ -1211,7 +1443,7 @@
 #undef CHECK_POINT
 #undef CHECK_BETTER
 
-int vp9_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_diamond_search_sad_c(MACROBLOCK *x,
                              int_mv *ref_mv, int_mv *best_mv,
                              int search_param, int sad_per_bit, int *num00,
                              vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
@@ -1218,10 +1450,11 @@
                              int *mvcost[2], int_mv *center_mv) {
   int i, j, step;
 
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
   uint8_t *best_address;
 
   int tot_steps;
@@ -1237,7 +1470,6 @@
 
   uint8_t *check_here;
   int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -1254,8 +1486,8 @@
   best_mv->as_mv.col = ref_col;
 
   // Work out the start point for the search
-  in_what = (uint8_t *)(*(d->base_pre) + d->pre +
-                        (ref_row * (d->pre_stride)) + ref_col);
+  in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
+                        (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
   best_address = in_what;
 
   // Check the starting position
@@ -1322,7 +1554,7 @@
                   xd->allow_high_precision_mv);
 }
 
-int vp9_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_diamond_search_sadx4(MACROBLOCK *x,
                              int_mv *ref_mv, int_mv *best_mv, int search_param,
                              int sad_per_bit, int *num00,
                              vp9_variance_fn_ptr_t *fn_ptr,
@@ -1329,10 +1561,11 @@
                              int *mvjcost, int *mvcost[2], int_mv *center_mv) {
   int i, j, step;
 
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
   uint8_t *best_address;
 
   int tot_steps;
@@ -1350,7 +1583,6 @@
 
   uint8_t *check_here;
   unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -1367,8 +1599,8 @@
   best_mv->as_mv.col = ref_col;
 
   // Work out the start point for the search
-  in_what = (uint8_t *)(*(d->base_pre) + d->pre +
-                        (ref_row * (d->pre_stride)) + ref_col);
+  in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
+                        (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
   best_address = in_what;
 
   // Check the starting position
@@ -1472,14 +1704,14 @@
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
+int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
+                           int_mv *mvp_full, int step_param,
                            int sadpb, int further_steps,
                            int do_refine, vp9_variance_fn_ptr_t *fn_ptr,
                            int_mv *ref_mv, int_mv *dst_mv) {
   int_mv temp_mv;
   int thissme, n, num00;
-  int bestsme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+  int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost,
                                         x->mvcost, ref_mv);
@@ -1498,7 +1730,7 @@
     if (num00)
       num00--;
     else {
-      thissme = cpi->diamond_search_sad(x, b, d, mvp_full, &temp_mv,
+      thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost, x->mvcost,
                                         ref_mv);
@@ -1519,7 +1751,7 @@
     int search_range = 8;
     int_mv best_mv;
     best_mv.as_int = dst_mv->as_int;
-    thissme = cpi->refining_search_sad(x, b, d, &best_mv, sadpb, search_range,
+    thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, x->nmvjointcost, x->mvcost,
                                        ref_mv);
 
@@ -1531,18 +1763,19 @@
   return bestsme;
 }
 
-int vp9_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
                           int *mvcost[2],
-                          int_mv *center_mv) {
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+                          int_mv *center_mv, int n) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  int mv_stride = xd->plane[0].pre[0].stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv[0];
+  int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
   int_mv this_mv;
   int bestsad = INT_MAX;
   int r, c;
@@ -1549,7 +1782,6 @@
 
   uint8_t *check_here;
   int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int ref_row = ref_mv->as_mv.row;
   int ref_col = ref_mv->as_mv.col;
@@ -1567,8 +1799,8 @@
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+  in_what = xd->plane[0].pre[0].buf;
+  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
 
   best_mv->as_mv.row = ref_row;
   best_mv->as_mv.col = ref_col;
@@ -1627,17 +1859,18 @@
     return INT_MAX;
 }
 
-int vp9_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
-                          int *mvcost[2], int_mv *center_mv) {
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+                          int *mvcost[2], int_mv *center_mv, int n) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  int mv_stride = xd->plane[0].pre[0].stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv[0];
+  int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1644,7 +1877,6 @@
 
   uint8_t *check_here;
   unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int ref_row = ref_mv->as_mv.row;
   int ref_col = ref_mv->as_mv.col;
@@ -1664,8 +1896,8 @@
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+  in_what = xd->plane[0].pre[0].buf;
+  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
 
   best_mv->as_mv.row = ref_row;
   best_mv->as_mv.col = ref_col;
@@ -1755,18 +1987,19 @@
     return INT_MAX;
 }
 
-int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
-                          int_mv *center_mv) {
-  uint8_t *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
+                          int_mv *center_mv, int n) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  int what_stride = x->plane[0].src.stride;
   uint8_t *in_what;
-  int in_what_stride = d->pre_stride;
-  int mv_stride = d->pre_stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  int mv_stride = xd->plane[0].pre[0].stride;
   uint8_t *bestaddress;
-  int_mv *best_mv = &d->bmi.as_mv[0];
+  int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0];
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
@@ -1773,7 +2006,6 @@
 
   uint8_t *check_here;
   unsigned int thissad;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   int ref_row = ref_mv->as_mv.row;
   int ref_col = ref_mv->as_mv.col;
@@ -1794,8 +2026,8 @@
   fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
   // Work out the mid point for the search
-  in_what = *(d->base_pre) + d->pre;
-  bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+  in_what = xd->plane[0].pre[0].buf;
+  bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col;
 
   best_mv->as_mv.row = ref_row;
   best_mv->as_mv.col = ref_col;
@@ -1909,25 +2141,25 @@
   else
     return INT_MAX;
 }
-int vp9_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_refining_search_sad_c(MACROBLOCK *x,
                               int_mv *ref_mv, int error_per_bit,
                               int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
   MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
   int this_row_offset, this_col_offset;
 
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  uint8_t *what = (*(b->base_src) + b->src);
-  uint8_t *best_address = (uint8_t *)(*(d->base_pre) + d->pre +
-                                      (ref_mv->as_mv.row * (d->pre_stride)) +
-                                      ref_mv->as_mv.col);
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
   uint8_t *check_here;
   unsigned int thissad;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -1987,25 +2219,25 @@
     return INT_MAX;
 }
 
-int vp9_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_refining_search_sadx4(MACROBLOCK *x,
                               int_mv *ref_mv, int error_per_bit,
                               int search_range, vp9_variance_fn_ptr_t *fn_ptr,
                               int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
   MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
   int this_row_offset, this_col_offset;
 
-  int what_stride = b->src_stride;
-  int in_what_stride = d->pre_stride;
-  uint8_t *what = (*(b->base_src) + b->src);
-  uint8_t *best_address = (uint8_t *)(*(d->base_pre) + d->pre +
-                                      (ref_mv->as_mv.row * (d->pre_stride)) +
-                                      ref_mv->as_mv.col);
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
   uint8_t *check_here;
   unsigned int thissad;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
-  MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
 
   int *mvjsadcost = x->nmvjointsadcost;
@@ -2094,33 +2326,104 @@
     return INT_MAX;
 }
 
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
+                             const uint8_t *second_pred, int w, int h) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  int i, j;
+  int this_row_offset, this_col_offset;
 
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
+  uint8_t *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  int_mv fcenter_mv;
 
-#ifdef ENTROPY_STATS
-void print_mode_context(VP9_COMMON *pc) {
-  FILE *f = fopen("vp9_modecont.c", "a");
-  int i, j;
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
 
-  fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] =");
-  fprintf(f, "{\n");
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
-    fprintf(f, "  {/* %d */ ", j);
-    fprintf(f, "    ");
-    for (i = 0; i < 4; i++) {
-      int this_prob;
+  /* Compound pred buffer */
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
 
-      // context probs
-      this_prob = get_binary_prob(pc->fc.mv_ref_ct[j][i][0],
-                                  pc->fc.mv_ref_ct[j][i][1]);
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-      fprintf(f, "%5d, ", this_prob);
+  /* Get compound pred by averaging two pred blocks. */
+  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 8; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+            best_address;
+
+        /* Get compound block and use it to calculate SAD. */
+        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+                      in_what_stride);
+        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+                                    mvsadcost, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
     }
-    fprintf(f, "  },\n");
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+          neighbors[best_site].col;
+    }
   }
 
-  fprintf(f, "};\n");
-  fclose(f);
-}
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
 
-#endif/* END MV ref count ENTROPY_STATS stats code */
+  if (bestsad < INT_MAX) {
+    int besterr;
+    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+        (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+                    xd->allow_high_precision_mv);
+    vpx_free(comp_pred);
+    return besterr;
+  } else {
+    vpx_free(comp_pred);
+    return INT_MAX;
+  }
+}
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -15,10 +15,6 @@
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_variance.h"
 
-#ifdef ENTROPY_STATS
-void print_mode_context(VP9_COMMON *pc);
-#endif
-
 // The maximum number of steps in a step search given the largest
 // allowed initial step
 #define MAX_MVSEARCH_STEPS 11
@@ -37,13 +33,13 @@
 
 // Runs sequence of diamond searches in smaller steps for RD
 struct VP9_COMP;
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,
-                           BLOCKD *d, int_mv *mvp_full, int step_param,
+int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
+                           int_mv *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            vp9_variance_fn_ptr_t *fn_ptr,
                            int_mv *ref_mv, int_mv *dst_mv);
 
-int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+int vp9_hex_search(MACROBLOCK *x,
                    int_mv *ref_mv, int_mv *best_mv,
                    int search_param, int error_per_bit,
                    const vp9_variance_fn_ptr_t *vf,
@@ -51,7 +47,7 @@
                    int *mvjcost, int *mvcost[2],
                    int_mv *center_mv);
 
-typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv
+typedef int (fractional_mv_step_fp) (MACROBLOCK *x, int_mv
   *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
   int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse);
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively;
@@ -58,13 +54,13 @@
 extern fractional_mv_step_fp vp9_find_best_sub_pixel_step;
 extern fractional_mv_step_fp vp9_find_best_half_pixel_step;
 
-typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
                                     int_mv *ref_mv, int sad_per_bit,
                                     int distance, vp9_variance_fn_ptr_t *fn_ptr,
                                     int *mvjcost, int *mvcost[2],
-                                    int_mv *center_mv);
+                                    int_mv *center_mv, int n);
 
-typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x,
                                         int_mv *ref_mv, int sad_per_bit,
                                         int distance,
                                         vp9_variance_fn_ptr_t *fn_ptr,
@@ -71,7 +67,7 @@
                                         int *mvjcost, int *mvcost[2],
                                         int_mv *center_mv);
 
-typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
                                        int_mv *ref_mv, int_mv *best_mv,
                                        int search_param, int sad_per_bit,
                                        int *num00,
@@ -79,5 +75,19 @@
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion, unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h);
 
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2],
+                             int_mv *center_mv, const uint8_t *second_pred,
+                             int w, int h);
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,32 +17,23 @@
 
 void vp9_init_mode_costs(VP9_COMP *c) {
   VP9_COMMON *x = &c->common;
-  const vp9_tree_p T = vp9_bmode_tree;
-  const vp9_tree_p KT = vp9_kf_bmode_tree;
+  const vp9_tree_p KT = vp9_intra_mode_tree;
   int i, j;
 
-  for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
-    for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
-      vp9_cost_tokens((int *)c->mb.bmode_costs[i][j],
-                      x->kf_bmode_prob[i][j], KT);
+  for (i = 0; i < VP9_INTRA_MODES; i++) {
+    for (j = 0; j < VP9_INTRA_MODES; j++) {
+      vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j],
+                      x->kf_y_mode_prob[i][j], KT);
     }
   }
 
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
-  vp9_cost_tokens((int *)c->mb.inter_bmode_costs,
-                  x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
-
   // TODO(rbultje) separate tables for superblock costing?
-  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
-  vp9_cost_tokens(c->mb.mbmode_cost[0],
-                  x->kf_ymode_prob[c->common.kf_ymode_probs_index],
-                  vp9_kf_ymode_tree);
+  vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1],
+                  vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
-                  x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
+                  x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
-                  x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
+                  x->kf_uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree);
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -32,7 +32,6 @@
 #include "vp9/common/vp9_postproc.h"
 #endif
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
 #include "vp9/common/vp9_seg_common.h"
@@ -97,16 +96,11 @@
 FILE *keyfile;
 #endif
 
-#if 0
-extern int skip_true_count;
-extern int skip_false_count;
-#endif
 
-
 #ifdef ENTROPY_STATS
-extern int intra_mode_stats[VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES]
-                           [VP9_KF_BINTRAMODES];
+extern int intra_mode_stats[VP9_INTRA_MODES]
+                           [VP9_INTRA_MODES]
+                           [VP9_INTRA_MODES];
 #endif
 
 #ifdef NMV_STATS
@@ -113,13 +107,12 @@
 extern void init_nmvstats();
 extern void print_nmvstats();
 #endif
-
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-extern void init_nzcstats();
-extern void print_nzcstats();
+#ifdef MODE_STATS
+extern void init_tx_count_stats();
+extern void write_tx_count_stats();
+extern void init_switchable_interp_stats();
+extern void write_switchable_interp_stats();
 #endif
-#endif
 
 #ifdef SPEEDSTATS
 unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@@ -128,22 +121,9 @@
 #if defined(SECTIONBITS_OUTPUT)
 extern unsigned __int64 Sectionbits[500];
 #endif
-#ifdef MODE_STATS
-extern int64_t Sectionbits[500];
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int i8x8_modes[VP9_I8X8_MODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int uv_modes_y[VP9_YMODES][VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
 
 extern void vp9_init_quantizer(VP9_COMP *cpi);
 
-static int base_skip_false_prob[QINDEX_RANGE][3];
-
 // Tables relating active max Q to active min Q
 static int kf_low_motion_minq[QINDEX_RANGE];
 static int kf_high_motion_minq[QINDEX_RANGE];
@@ -161,6 +141,11 @@
   const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
                                 maxq);
 
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0)
+    return 0;
+
   for (i = 0; i < QINDEX_RANGE; i++) {
     if (minqtarget <= vp9_convert_qindex_to_q(i))
       return i;
@@ -177,15 +162,16 @@
 
 
     kf_low_motion_minq[i] = calculate_minq_index(maxq,
-                                                 0.0000003,
-                                                 -0.000015,
-                                                 0.074,
+                                                 0.000001,
+                                                 -0.0004,
+                                                 0.15,
                                                  0.0);
     kf_high_motion_minq[i] = calculate_minq_index(maxq,
-                                                  0.0000004,
-                                                  -0.000125,
-                                                  0.14,
+                                                  0.000002,
+                                                  -0.0012,
+                                                  0.5,
                                                   0.0);
+
     gf_low_motion_minq[i] = calculate_minq_index(maxq,
                                                  0.0000015,
                                                  -0.0009,
@@ -214,52 +200,7 @@
     mb->mvsadcost = mb->nmvsadcost;
   }
 }
-static void init_base_skip_probs(void) {
-  int i;
 
-  for (i = 0; i < QINDEX_RANGE; i++) {
-    const double q = vp9_convert_qindex_to_q(i);
-
-    // Exponential decay caluclation of baseline skip prob with clamping
-    // Based on crude best fit of old table.
-    const int t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
-
-    base_skip_false_prob[i][1] = clip_prob(t);
-    base_skip_false_prob[i][2] = clip_prob(t * 3 / 4);
-    base_skip_false_prob[i][0] = clip_prob(t * 5 / 4);
-  }
-}
-
-static void update_base_skip_probs(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  if (cm->frame_type != KEY_FRAME) {
-    vp9_update_skip_probs(cpi);
-
-    if (cpi->refresh_alt_ref_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[2] = cm->base_qindex;
-    } else if (cpi->refresh_golden_frame) {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[1] = cm->base_qindex;
-    } else {
-      int k;
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->last_skip_false_probs[0][k] = cm->mbskip_pred_probs[k];
-      cpi->last_skip_probs_q[0] = cm->base_qindex;
-
-      // update the baseline table for the current q
-      for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-        cpi->base_skip_false_prob[cm->base_qindex][k] =
-          cm->mbskip_pred_probs[k];
-    }
-  }
-}
-
 void vp9_initialize_enc() {
   static int init_done = 0;
 
@@ -269,21 +210,17 @@
     vp9_init_quant_tables();
     vp9_init_me_luts();
     init_minq_luts();
-    init_base_skip_probs();
+    // init_base_skip_probs();
     init_done = 1;
   }
 }
-#ifdef PACKET_TESTING
-extern FILE *vpxlogc;
-#endif
 
 static void setup_features(VP9_COMP *cpi) {
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
   // Set up default state for MB feature flags
+  xd->segmentation_enabled = 0;
 
-  xd->segmentation_enabled = 0;   // Default segmentation disabled
-
   xd->update_mb_segmentation_map = 0;
   xd->update_mb_segmentation_data = 0;
   vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
@@ -300,21 +237,7 @@
   set_default_lf_deltas(cpi);
 }
 
-
 static void dealloc_compressor_data(VP9_COMP *cpi) {
-  vpx_free(cpi->tplist);
-  cpi->tplist = NULL;
-
-  // Delete last frame MV storage buffers
-  vpx_free(cpi->lfmv);
-  cpi->lfmv = 0;
-
-  vpx_free(cpi->lf_ref_frame_sign_bias);
-  cpi->lf_ref_frame_sign_bias = 0;
-
-  vpx_free(cpi->lf_ref_frame);
-  cpi->lf_ref_frame = 0;
-
   // Delete sementation map
   vpx_free(cpi->segmentation_map);
   cpi->segmentation_map = 0;
@@ -326,20 +249,16 @@
   vpx_free(cpi->active_map);
   cpi->active_map = 0;
 
-  vp9_de_alloc_frame_buffers(&cpi->common);
+  vp9_free_frame_buffers(&cpi->common);
 
-  vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
-  vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+  vp9_free_frame_buffer(&cpi->last_frame_uf);
+  vp9_free_frame_buffer(&cpi->scaled_source);
+  vp9_free_frame_buffer(&cpi->alt_ref_buffer);
   vp9_lookahead_destroy(cpi->lookahead);
 
   vpx_free(cpi->tok);
   cpi->tok = 0;
 
-  // Structure used to monitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  cpi->gf_active_flags = 0;
-
   // Activity mask based per mb zbin adjustments
   vpx_free(cpi->mb_activity_map);
   cpi->mb_activity_map = 0;
@@ -348,15 +267,6 @@
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -394,7 +304,7 @@
   // Disable and clear down for KF
   if (cm->frame_type == KEY_FRAME) {
     // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
     cpi->static_mb_pct = 0;
@@ -407,7 +317,7 @@
   } else if (cpi->refresh_alt_ref_frame) {
     // If this is an alt ref frame
     // Clear down the global segmentation map
-    vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
     xd->update_mb_segmentation_map = 0;
     xd->update_mb_segmentation_data = 0;
     cpi->static_mb_pct = 0;
@@ -437,9 +347,9 @@
       xd->mb_segment_abs_delta = SEGMENT_DELTADATA;
 
     }
-  }
-  // All other frames if segmentation has been enabled
-  else if (xd->segmentation_enabled) {
+  } else if (xd->segmentation_enabled) {
+    // All other frames if segmentation has been enabled
+
     // First normal frame in a valid gf or alt ref group
     if (cpi->common.frames_since_golden == 0) {
       // Set up segment features for normal frames in an arf group
@@ -451,7 +361,6 @@
         qi_delta = compute_qdelta(cpi, cpi->avg_q,
                                   (cpi->avg_q * 1.125));
         vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
-        vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, 0);
         vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q);
 
         vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2);
@@ -459,18 +368,17 @@
 
         // Segment coding disabled for compred testing
         if (high_q || (cpi->static_mb_pct == 100)) {
-          vp9_set_segref(xd, 1, ALTREF_FRAME);
+          vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
           vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);
         }
-      }
-      // Disable segmentation and clear down features if alt ref
-      // is not active for this group
-      else {
+      } else {
+        // Disable segmentation and clear down features if alt ref
+        // is not active for this group
+
         vp9_disable_segmentation((VP9_PTR)cpi);
 
-        vpx_memset(cpi->segmentation_map, 0,
-                   (cm->mb_rows * cm->mb_cols));
+        vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
         xd->update_mb_segmentation_map = 0;
         xd->update_mb_segmentation_data = 0;
@@ -477,21 +385,20 @@
 
         vp9_clearall_segfeatures(xd);
       }
-    }
+    } else if (cpi->is_src_frame_alt_ref) {
+      // Special case where we are coding over the top of a previous
+      // alt ref frame.
+      // Segment coding disabled for compred testing
 
-    // Special case where we are coding over the top of a previous
-    // alt ref frame.
-    // Segment coding disabled for compred testing
-    else if (cpi->is_src_frame_alt_ref) {
       // Enable ref frame features for segment 0 as well
       vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);
       vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);
 
       // All mbs should use ALTREF_FRAME
-      vp9_clear_segref(xd, 0);
-      vp9_set_segref(xd, 0, ALTREF_FRAME);
-      vp9_clear_segref(xd, 1);
-      vp9_set_segref(xd, 1, ALTREF_FRAME);
+      vp9_clear_segdata(xd, 0, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(xd, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+      vp9_clear_segdata(xd, 1, SEG_LVL_REF_FRAME);
+      vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
 
       // Skip all MBs if high Q (0,0 mv and skip coeffs)
       if (high_q) {
@@ -500,9 +407,9 @@
       }
       // Enable data udpate
       xd->update_mb_segmentation_data = 1;
-    }
-    // All other frames.
-    else {
+    } else {
+      // All other frames.
+
       // No updates.. leave things as they are.
       xd->update_mb_segmentation_map = 0;
       xd->update_mb_segmentation_data = 0;
@@ -510,6 +417,69 @@
   }
 }
 
+#ifdef ENTROPY_STATS
+void vp9_update_mode_context_stats(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+  int i, j;
+  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
+      cm->fc.inter_mode_counts;
+  int64_t (*mv_ref_stats)[VP9_INTER_MODES - 1][2] = cpi->mv_ref_stats;
+  FILE *f;
+
+  // Read the past stats counters
+  f = fopen("mode_context.bin",  "rb");
+  if (!f) {
+    vpx_memset(cpi->mv_ref_stats, 0, sizeof(cpi->mv_ref_stats));
+  } else {
+    fread(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
+    fclose(f);
+  }
+
+  // Add in the values for this frame
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
+    for (j = 0; j < VP9_INTER_MODES - 1; j++) {
+      mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0];
+      mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1];
+    }
+  }
+
+  // Write back the accumulated stats
+  f = fopen("mode_context.bin",  "wb");
+  fwrite(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
+  fclose(f);
+}
+
+void print_mode_context(VP9_COMP *cpi) {
+  FILE *f = fopen("vp9_modecont.c", "a");
+  int i, j;
+
+  fprintf(f, "#include \"vp9_entropy.h\"\n");
+  fprintf(
+      f,
+      "const int inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1] =");
+  fprintf(f, "{\n");
+  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
+    fprintf(f, "  {/* %d */ ", j);
+    fprintf(f, "    ");
+    for (i = 0; i < VP9_INTER_MODES - 1; i++) {
+      int this_prob;
+      int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1];
+      if (count)
+        this_prob = ((cpi->mv_ref_stats[j][i][0] * 256) + (count >> 1)) / count;
+      else
+        this_prob = 128;
+
+      // context probs
+      fprintf(f, "%5d, ", this_prob);
+    }
+    fprintf(f, "  },\n");
+  }
+
+  fprintf(f, "};\n");
+  fclose(f);
+}
+#endif  // ENTROPY_STATS
+
 // DEBUG: Print out the segment id of each MB in the current frame.
 static void print_seg_map(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
@@ -519,8 +489,8 @@
 
   fprintf(statsfile, "%10d\n", cm->current_video_frame);
 
-  for (row = 0; row < cpi->common.mb_rows; row++) {
-    for (col = 0; col < cpi->common.mb_cols; col++) {
+  for (row = 0; row < cpi->common.mi_rows; row++) {
+    for (col = 0; col < cpi->common.mi_cols; col++) {
       fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]);
       map_index++;
     }
@@ -537,14 +507,13 @@
   MODE_INFO *mi, *mi_ptr = cm->mi;
   uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
 
-  for (row = 0; row < cm->mb_rows; row++) {
+  for (row = 0; row < cm->mi_rows; row++) {
     mi = mi_ptr;
     cache = cache_ptr;
-    for (col = 0; col < cm->mb_cols; col++, mi++, cache++) {
+    for (col = 0; col < cm->mi_cols; col++, mi++, cache++)
       cache[0] = mi->mbmi.segment_id;
-    }
     mi_ptr += cm->mode_info_stride;
-    cache_ptr += cm->mb_cols;
+    cache_ptr += cm->mi_cols;
   }
 }
 
@@ -561,10 +530,8 @@
   cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
   cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
 
-  cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
-  cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
-  cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
-  cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
+  cpi->mb.e_mbd.mode_lf_deltas[0] = 0;              // Zero
+  cpi->mb.e_mbd.mode_lf_deltas[1] = 0;               // New mv
 }
 
 static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
@@ -573,9 +540,8 @@
   int i;
 
   // Set baseline threshold values
-  for (i = 0; i < MAX_MODES; ++i) {
-    sf->thresh_mult[i] = (mode == 0) ? -500 : 0;
-  }
+  for (i = 0; i < MAX_MODES; ++i)
+    sf->thresh_mult[i] = mode == 0 ? -500 : 0;
 
   sf->thresh_mult[THR_ZEROMV   ] = 0;
   sf->thresh_mult[THR_ZEROG    ] = 0;
@@ -601,7 +567,6 @@
   sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
 
   sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
-  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
 
   sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
   sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
@@ -611,44 +576,40 @@
   sf->thresh_mult[THR_SPLITG   ] += speed_multiplier * 2500;
   sf->thresh_mult[THR_SPLITA   ] += speed_multiplier * 2500;
 
-  sf->thresh_mult[THR_COMP_ZEROLG   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_ZEROLA   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_ZEROGA   ] += speed_multiplier * 1500;
 
-  sf->thresh_mult[THR_COMP_NEARESTLG] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500;
 
-  sf->thresh_mult[THR_COMP_NEARLG   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARLA   ] += speed_multiplier * 1500;
   sf->thresh_mult[THR_COMP_NEARGA   ] += speed_multiplier * 1500;
 
-  sf->thresh_mult[THR_COMP_NEWLG    ] += speed_multiplier * 2000;
   sf->thresh_mult[THR_COMP_NEWLA    ] += speed_multiplier * 2000;
   sf->thresh_mult[THR_COMP_NEWGA    ] += speed_multiplier * 2000;
 
   sf->thresh_mult[THR_COMP_SPLITLA  ] += speed_multiplier * 4500;
   sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;
-  sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] += speed_multiplier * 1500;
+  if (speed > 4) {
+    for (i = 0; i < MAX_MODES; ++i)
+      sf->thresh_mult[i] = INT_MAX;
 
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500;
+    sf->thresh_mult[THR_DC       ] = 0;
+    sf->thresh_mult[THR_TM       ] = 0;
+    sf->thresh_mult[THR_NEWMV    ] = 4000;
+    sf->thresh_mult[THR_NEWG     ] = 4000;
+    sf->thresh_mult[THR_NEWA     ] = 4000;
+    sf->thresh_mult[THR_NEARESTMV] = 0;
+    sf->thresh_mult[THR_NEARESTG ] = 0;
+    sf->thresh_mult[THR_NEARESTA ] = 0;
+    sf->thresh_mult[THR_NEARMV   ] = 2000;
+    sf->thresh_mult[THR_NEARG    ] = 2000;
+    sf->thresh_mult[THR_NEARA    ] = 2000;
+    sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+    sf->recode_loop = 0;
+  }
 
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] += speed_multiplier * 1500;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] += speed_multiplier * 1500;
-
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] += speed_multiplier * 2000;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] += speed_multiplier * 2000;
-  sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] += speed_multiplier * 2000;
-#endif
-
   /* disable frame modes if flags not set */
   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {
     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
@@ -656,12 +617,6 @@
     sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
     sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
     sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;
-#endif
   }
   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
     sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
@@ -669,12 +624,6 @@
     sf->thresh_mult[THR_NEARG    ] = INT_MAX;
     sf->thresh_mult[THR_NEWG     ] = INT_MAX;
     sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = INT_MAX;
-#endif
   }
   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {
     sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
@@ -682,22 +631,8 @@
     sf->thresh_mult[THR_NEARA    ] = INT_MAX;
     sf->thresh_mult[THR_NEWA     ] = INT_MAX;
     sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = INT_MAX;
-#endif
   }
 
-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) !=
-      (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {
-    sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;
-    sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;
-  }
   if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=
       (VP9_LAST_FLAG | VP9_ALT_FLAG)) {
     sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;
@@ -719,7 +654,7 @@
 void vp9_set_speed_features(VP9_COMP *cpi) {
   SPEED_FEATURES *sf = &cpi->sf;
   int mode = cpi->compressor_speed;
-  int speed = cpi->Speed;
+  int speed = cpi->speed;
   int i;
 
   // Only modes 0 and 1 supported for now in experimental code basae
@@ -736,24 +671,24 @@
   // best quality defaults
   sf->RD = 1;
   sf->search_method = NSTEP;
-  sf->improved_dct = 1;
   sf->auto_filter = 1;
   sf->recode_loop = 1;
   sf->quarter_pixel_search = 1;
   sf->half_pixel_search = 1;
   sf->iterative_sub_pixel = 1;
-  sf->no_skip_block4x4_search = 1;
-  if (cpi->oxcf.lossless)
-    sf->optimize_coefficients = 0;
-  else
-    sf->optimize_coefficients = 1;
-
+  sf->optimize_coefficients = !cpi->oxcf.lossless;
   sf->first_step = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->static_segmentation = 1;
-  sf->splitmode_breakout = 0;
-  sf->mb16_breakout = 0;
+  sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
+  sf->adpative_rd_thresh = 0;
 
+#if CONFIG_MULTIPLE_ARF
+  // Switch segmentation off.
+  sf->static_segmentation = 0;
+#else
+  sf->static_segmentation = 0;
+#endif
+
   switch (mode) {
     case 0: // best quality mode
       sf->search_best_filter = SEARCH_BEST_FILTER;
@@ -760,52 +695,19 @@
       break;
 
     case 1:
-      sf->static_segmentation = 1;
-      sf->splitmode_breakout = 1;
-      sf->mb16_breakout = 0;
-
+#if CONFIG_MULTIPLE_ARF
+      // Switch segmentation off.
+      sf->static_segmentation = 0;
+#else
+      sf->static_segmentation = 0;
+#endif
+      sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+      sf->adpative_rd_thresh = 1;
       if (speed > 0) {
-        /* Disable coefficient optimization above speed 0 */
+        sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
         sf->optimize_coefficients = 0;
-        sf->no_skip_block4x4_search = 0;
-
         sf->first_step = 1;
-
-        cpi->mode_check_freq[THR_SPLITG] = 2;
-        cpi->mode_check_freq[THR_SPLITA] = 2;
-        cpi->mode_check_freq[THR_SPLITMV] = 0;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;
       }
-
-      if (speed > 1) {
-        cpi->mode_check_freq[THR_SPLITG] = 4;
-        cpi->mode_check_freq[THR_SPLITA] = 4;
-        cpi->mode_check_freq[THR_SPLITMV] = 2;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;
-      }
-
-      if (speed > 2) {
-        cpi->mode_check_freq[THR_SPLITG] = 15;
-        cpi->mode_check_freq[THR_SPLITA] = 15;
-        cpi->mode_check_freq[THR_SPLITMV] = 7;
-
-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;
-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;
-
-        sf->improved_dct = 0;
-
-        // Only do recode loop on key frames, golden frames and
-        // alt ref frames
-        sf->recode_loop = 2;
-      }
-
       break;
 
   }; /* switch */
@@ -817,7 +719,6 @@
   // so make sure they are always turned off.
   if (cpi->pass == 1) {
     sf->optimize_coefficients = 0;
-    sf->improved_dct = 0;
   }
 
   cpi->mb.fwd_txm16x16  = vp9_short_fdct16x16;
@@ -830,9 +731,6 @@
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
 
   vp9_init_quantizer(cpi);
 
@@ -844,26 +742,27 @@
     cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step;
   }
 
-  if (cpi->sf.optimize_coefficients == 1 && cpi->pass != 1)
-    cpi->mb.optimize = 1;
-  else
-    cpi->mb.optimize = 0;
+  cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;
 
 #ifdef SPEEDSTATS
-  frames_at_speed[cpi->Speed]++;
+  frames_at_speed[cpi->speed]++;
 #endif
 }
 
 static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+  VP9_COMMON *cm = &cpi->common;
+
   cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,
+                                      cm->subsampling_x, cm->subsampling_y,
                                       cpi->oxcf.lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
-                                  cpi->oxcf.width, cpi->oxcf.height,
-                                  VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
+                               cpi->oxcf.width, cpi->oxcf.height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
@@ -871,8 +770,8 @@
 static int alloc_partition_data(VP9_COMP *cpi) {
   vpx_free(cpi->mb.pip);
 
-  cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
-                           (cpi->common.mb_rows + 1),
+  cpi->mb.pip = vpx_calloc((cpi->common.mode_info_stride) *
+                           (cpi->common.mi_rows + 64 / MI_SIZE),
                            sizeof(PARTITION_INFO));
   if (!cpi->mb.pip)
     return 1;
@@ -893,13 +792,17 @@
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate partition data");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
-  if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
-                                  cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_alloc_frame_buffer(&cpi->scaled_source,
+                             cm->width, cm->height,
+                             cm->subsampling_x, cm->subsampling_y,
+                             VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
@@ -906,7 +809,7 @@
   vpx_free(cpi->tok);
 
   {
-    unsigned int tokens = cm->mb_rows * cm->mb_cols * (24 * 16 + 1);
+    unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
 
     CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
@@ -916,13 +819,6 @@
   cpi->gf_bad_count = 0;
   cpi->gf_update_recommended = 0;
 
-
-  // Structures used to minitor GF usage
-  vpx_free(cpi->gf_active_flags);
-  CHECK_MEM_ERROR(cpi->gf_active_flags,
-                  vpx_calloc(1, cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
   vpx_free(cpi->mb_activity_map);
   CHECK_MEM_ERROR(cpi->mb_activity_map,
                   vpx_calloc(sizeof(unsigned int),
@@ -932,28 +828,6 @@
   CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
-
-  vpx_free(cpi->tplist);
-
-  CHECK_MEM_ERROR(cpi->tplist,
-                  vpx_malloc(sizeof(TOKENLIST) * (cpi->common.mb_rows)));
 }
 
 
@@ -960,30 +834,20 @@
 static void update_frame_size(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
 
-  /* our internal buffers are always multiples of 16 */
-  int aligned_width = (cm->width + 15) & ~15;
-  int aligned_height = (cm->height + 15) & ~15;
+  vp9_update_frame_size(cm);
 
-  cm->mb_rows = aligned_height >> 4;
-  cm->mb_cols = aligned_width >> 4;
-  cm->MBs = cm->mb_rows * cm->mb_cols;
-  cm->mode_info_stride = cm->mb_cols + 1;
-  memset(cm->mip, 0,
-        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));
-  vp9_update_mode_info_border(cm, cm->mip);
-
-  cm->mi = cm->mip + cm->mode_info_stride + 1;
-  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
-  vp9_update_mode_info_in_image(cm, cm->mi);
-
-  /* Update size of buffers local to this frame */
-  if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  // Update size of buffers local to this frame
+  if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate last frame buffer");
 
-  if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,
-                                    cm->width, cm->height, VP9BORDERINPIXELS))
+  if (vp9_realloc_frame_buffer(&cpi->scaled_source,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS))
     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
 
@@ -1026,7 +890,7 @@
   return 63;
 };
 void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) {
-  if (framerate < .1)
+  if (framerate < 0.1)
     framerate = 30;
 
   cpi->oxcf.frame_rate             = framerate;
@@ -1035,9 +899,9 @@
   cpi->av_per_frame_bandwidth        = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
   cpi->min_frame_bandwidth          = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
 
-  if (cpi->min_frame_bandwidth < FRAME_OVERHEAD_BITS)
-    cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;
 
+  cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
   // Set Maximum gf/arf interval
   cpi->max_gf_interval = 16;
 
@@ -1074,10 +938,10 @@
 
   vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles);
   max_log2_tiles += min_log2_tiles;
-  if (cm->log2_tile_columns < min_log2_tiles)
-    cm->log2_tile_columns = min_log2_tiles;
-  else if (cm->log2_tile_columns > max_log2_tiles)
-    cm->log2_tile_columns = max_log2_tiles;
+
+  cm->log2_tile_columns = clamp(cm->log2_tile_columns,
+                                min_log2_tiles, max_log2_tiles);
+
   cm->tile_columns = 1 << cm->log2_tile_columns;
   cm->tile_rows = 1 << cm->log2_tile_rows;
 }
@@ -1085,16 +949,18 @@
 static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   VP9_COMP *cpi = (VP9_COMP *)(ptr);
   VP9_COMMON *const cm = &cpi->common;
+  int i;
 
   cpi->oxcf = *oxcf;
-
   cpi->goldfreq = 7;
 
   cm->version = oxcf->version;
-  vp9_setup_version(cm);
 
   cm->width = oxcf->width;
   cm->height = oxcf->height;
+  cm->subsampling_x = 0;
+  cm->subsampling_y = 0;
+  vp9_alloc_compressor_data(cpi);
 
   // change includes all joint functionality
   vp9_change_config(ptr, oxcf);
@@ -1124,12 +990,9 @@
 
   set_tile_limits(cpi);
 
-  {
-    int i;
-    cpi->fixed_divide[0] = 0;
-    for (i = 1; i < 512; i++)
-      cpi->fixed_divide[i] = 0x80000 / i;
-  }
+  cpi->fixed_divide[0] = 0;
+  for (i = 1; i < 512; i++)
+    cpi->fixed_divide[i] = 0x80000 / i;
 }
 
 
@@ -1142,7 +1005,6 @@
 
   if (cm->version != oxcf->version) {
     cm->version = oxcf->version;
-    vp9_setup_version(cm);
   }
 
   cpi->oxcf = *oxcf;
@@ -1157,13 +1019,7 @@
     case MODE_SECONDPASS:
       cpi->pass = 2;
       cpi->compressor_speed = 1;
-
-      if (cpi->oxcf.cpu_used < -5) {
-        cpi->oxcf.cpu_used = -5;
-      }
-
-      if (cpi->oxcf.cpu_used > 5)
-        cpi->oxcf.cpu_used = 5;
+      cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5);
       break;
 
     case MODE_SECONDPASS_BEST:
@@ -1178,11 +1034,11 @@
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1193,7 +1049,8 @@
   // cpi->use_last_frame_only = 0;
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
-  cm->refresh_entropy_probs = 1;
+  cm->refresh_frame_context = 1;
+  cm->reset_frame_context = 0;
 
   setup_features(cpi);
   cpi->mb.e_mbd.allow_high_precision_mv = 0;   // Default mv precision adaptation
@@ -1207,8 +1064,7 @@
   }
 
   // At the moment the first order values may not be > MAXQ
-  if (cpi->oxcf.fixed_q > MAXQ)
-    cpi->oxcf.fixed_q = MAXQ;
+  cpi->oxcf.fixed_q = MIN(cpi->oxcf.fixed_q, MAXQ);
 
   // local file playback mode == really big buffer
   if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) {
@@ -1244,29 +1100,19 @@
   cpi->best_quality = cpi->oxcf.best_allowed_q;
 
   // active values should only be modified if out of new range
-  if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
-  }
-  if (cpi->active_best_quality < cpi->oxcf.best_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.best_allowed_q;
-  }
-  // less likely
-  else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q) {
-    cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
-  }
+  cpi->active_worst_quality = clamp(cpi->active_worst_quality,
+                                    cpi->oxcf.best_allowed_q,
+                                    cpi->oxcf.worst_allowed_q);
 
-  cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+  cpi->active_best_quality = clamp(cpi->active_best_quality,
+                                   cpi->oxcf.best_allowed_q,
+                                   cpi->oxcf.worst_allowed_q);
 
+  cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
+
   cpi->cq_target_quality = cpi->oxcf.cq_level;
 
-  if (!cm->use_bilinear_mc_filter)
-    cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
-  else
-    cm->mcomp_filter_type = BILINEAR;
+  cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
 
   cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
 
@@ -1274,22 +1120,17 @@
   cm->display_height = cpi->oxcf.height;
 
   // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-  if (cpi->oxcf.Sharpness > 7)
-    cpi->oxcf.Sharpness = 7;
+  cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness);
 
   cm->sharpness_level = cpi->oxcf.Sharpness;
 
-  // Increasing the size of the frame beyond the first seen frame, or some
-  // otherwise signalled maximum size, is not supported.
-  // TODO(jkoleszar): exit gracefully.
-  if (!cpi->initial_width) {
-    alloc_raw_frame_buffers(cpi);
-    vp9_alloc_compressor_data(cpi);
-    cpi->initial_width = cm->width;
-    cpi->initial_height = cm->height;
+  if (cpi->initial_width) {
+    // Increasing the size of the frame beyond the first seen frame, or some
+    // otherwise signalled maximum size, is not supported.
+    // TODO(jkoleszar): exit gracefully.
+    assert(cm->width <= cpi->initial_width);
+    assert(cm->height <= cpi->initial_height);
   }
-  assert(cm->width <= cpi->initial_width);
-  assert(cm->height <= cpi->initial_height);
   update_frame_size(cpi);
 
   if (cpi->oxcf.fixed_q >= 0) {
@@ -1298,18 +1139,22 @@
     cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
   }
 
-  cpi->Speed = cpi->oxcf.cpu_used;
+  cpi->speed = cpi->oxcf.cpu_used;
 
-  // force to allowlag to 0 if lag_in_frames is 0;
   if (cpi->oxcf.lag_in_frames == 0) {
+    // force to allowlag to 0 if lag_in_frames is 0;
     cpi->oxcf.allow_lag = 0;
-  }
-  // Limit on lag buffers as these are not currently dynamically allocated
-  else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+  } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) {
+     // Limit on lag buffers as these are not currently dynamically allocated
     cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+  }
 
   // YX Temp
+#if CONFIG_MULTIPLE_ARF
+  vp9_zero(cpi->alt_ref_source);
+#else
   cpi->alt_ref_source = NULL;
+#endif
   cpi->is_src_frame_alt_ref = 0;
 
 #if 0
@@ -1396,30 +1241,13 @@
 
   init_config((VP9_PTR)cpi, oxcf);
 
-  memcpy(cpi->base_skip_false_prob, base_skip_false_prob, sizeof(base_skip_false_prob));
   cpi->common.current_video_frame   = 0;
   cpi->kf_overspend_bits            = 0;
   cpi->kf_bitrate_adjustment        = 0;
-  cpi->frames_till_gf_update_due      = 0;
+  cpi->frames_till_gf_update_due    = 0;
   cpi->gf_overspend_bits            = 0;
-  cpi->non_gf_bitrate_adjustment     = 0;
-  cm->prob_last_coded               = 128;
-  cm->prob_gf_coded                 = 128;
-  cm->prob_intra_coded              = 63;
-  cm->sb32_coded                    = 200;
-  cm->sb64_coded                    = 200;
-  for (i = 0; i < COMP_PRED_CONTEXTS; i++)
-    cm->prob_comppred[i]         = 128;
-  for (i = 0; i < TX_SIZE_MAX_SB - 1; i++)
-    cm->prob_tx[i]               = 128;
+  cpi->non_gf_bitrate_adjustment    = 0;
 
-  // Prime the recent reference frame useage counters.
-  // Hereafter they will be maintained as a sort of moving average
-  cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
-  cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
-  cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-  cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-
   // Set reference frame sign bias for ALTREF frame to 1 (for now)
   cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
 
@@ -1429,22 +1257,18 @@
   cpi->alt_is_last  = 0;
   cpi->gold_is_alt  = 0;
 
-  // allocate memory for storing last frame's MVs for MV prediction.
-  CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int_mv)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-  CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows + 2) * (cpi->common.mb_cols + 2), sizeof(int)));
-
   // Create the encoder segmentation map and set all entries to 0
-  CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+  CHECK_MEM_ERROR(cpi->segmentation_map,
+                  vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
 
   // And a copy in common for temporal coding
   CHECK_MEM_ERROR(cm->last_frame_seg_map,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+                  vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
 
   // And a place holder structure is the coding context
   // for use if we want to save and restore it
   CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy,
-                  vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+                  vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1));
 
   CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
   vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols));
@@ -1462,24 +1286,14 @@
   if (cpi->pass != 1)
     init_context_counters();
 #endif
-#ifdef MODE_STATS
-  vp9_zero(y_modes);
-  vp9_zero(i8x8_modes);
-  vp9_zero(uv_modes);
-  vp9_zero(uv_modes_y);
-  vp9_zero(b_modes);
-  vp9_zero(inter_y_modes);
-  vp9_zero(inter_uv_modes);
-  vp9_zero(inter_b_modes);
-#endif
+
 #ifdef NMV_STATS
   init_nmvstats();
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-  init_nzcstats();
+#ifdef MODE_STATS
+  init_tx_count_stats();
+  init_switchable_interp_stats();
 #endif
-#endif
 
   /*Initialize the feed-forward activity masking.*/
   cpi->activity_avg = 90 << 12;
@@ -1486,13 +1300,26 @@
 
   cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
   cpi->key_frame_frequency = cpi->oxcf.key_freq;
-  cpi->this_key_frame_forced = FALSE;
-  cpi->next_key_frame_forced = FALSE;
+  cpi->this_key_frame_forced = 0;
+  cpi->next_key_frame_forced = 0;
 
-  cpi->source_alt_ref_pending = FALSE;
-  cpi->source_alt_ref_active = FALSE;
+  cpi->source_alt_ref_pending = 0;
+  cpi->source_alt_ref_active = 0;
   cpi->refresh_alt_ref_frame = 0;
 
+#if CONFIG_MULTIPLE_ARF
+  // Turn multiple ARF usage on/off. This is a quick hack for the initial test
+  // version. It should eventually be set via the codec API.
+  cpi->multi_arf_enabled = 1;
+
+  if (cpi->multi_arf_enabled) {
+    cpi->sequence_number = 0;
+    cpi->frame_coding_order_period = 0;
+    vp9_zero(cpi->frame_coding_order);
+    vp9_zero(cpi->arf_buffer_idx);
+  }
+#endif
+
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
   cpi->b_calculate_ssimg = 0;
@@ -1514,6 +1341,8 @@
     cpi->tot_recode_hits = 0;
     cpi->summed_quality = 0;
     cpi->summed_weights = 0;
+    cpi->summedp_quality = 0;
+    cpi->summedp_weights = 0;
   }
 
   if (cpi->b_calculate_ssimg) {
@@ -1555,9 +1384,8 @@
   cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
 
-  for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
+  for (i = 0; i < KEY_FRAME_CONTEXT; i++)
     cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
-  }
 
 #ifdef OUTPUT_YUV_SRC
   yuv_file = fopen("bd.yuv", "ab");
@@ -1589,14 +1417,14 @@
   vp9_set_speed_features(cpi);
 
   // Set starting values of RD threshold multipliers (128 = *1)
-  for (i = 0; i < MAX_MODES; i++) {
+  for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
-  }
 
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
     cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
     cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
     cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1604,33 +1432,69 @@
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
+  BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad32x16x4d)
 
+  BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad16x32x4d)
+
+  BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad64x32x4d)
+
+  BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
+      NULL, NULL, NULL,
+      vp9_sad32x64x4d)
+
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+      vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+      vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+      vp9_variance_halfpixvar16x16_v,
+      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
+  BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
+      NULL, NULL, vp9_sad8x4x8,
+      vp9_sad8x4x4d)
+
+  BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
+      NULL, NULL, vp9_sad4x8x8,
+      vp9_sad4x8x4d)
+
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
@@ -1651,13 +1515,6 @@
   cpi->common.error.setjmp = 0;
 
   vp9_zero(cpi->y_uv_mode_count)
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_zero(cm->fc.nzc_counts_4x4);
-  vp9_zero(cm->fc.nzc_counts_8x8);
-  vp9_zero(cm->fc.nzc_counts_16x16);
-  vp9_zero(cm->fc.nzc_counts_32x32);
-  vp9_zero(cm->fc.nzc_pcat_counts);
-#endif
 
   return (VP9_PTR) cpi;
 }
@@ -1678,7 +1535,7 @@
     if (cpi->pass != 1) {
       print_context_counters();
       print_tree_update_probs();
-      print_mode_context(&cpi->common);
+      print_mode_context(cpi);
     }
 #endif
 #ifdef NMV_STATS
@@ -1685,12 +1542,12 @@
     if (cpi->pass != 1)
       print_nmvstats();
 #endif
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-    if (cpi->pass != 1)
-      print_nzcstats();
+#ifdef MODE_STATS
+    if (cpi->pass != 1) {
+      write_tx_count_stats();
+      write_switchable_interp_stats();
+    }
 #endif
-#endif
 
 #if CONFIG_INTERNAL_STATS
 
@@ -1703,24 +1560,29 @@
                              - cpi->first_time_stamp_ever) / 10000000.000;
       double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
       double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
-#if defined(MODE_STATS)
-      print_mode_contexts(&cpi->common);
-#endif
+
       if (cpi->b_calculate_psnr) {
         YV12_BUFFER_CONFIG *lst_yv12 =
             &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];
-        double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
+        double samples = 3.0 / 2 * cpi->count *
+                         lst_yv12->y_width * lst_yv12->y_height;
         double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);
         double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);
-        double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+        double total_ssim = 100 * pow(cpi->summed_quality /
+                                      cpi->summed_weights, 8.0);
+        double total_ssimp = 100 * pow(cpi->summedp_quality /
+                                       cpi->summedp_weights, 8.0);
 
-        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(ms)\n");
-        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
-                dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+        fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                "VPXSSIM\tVPSSIMP\t  Time(ms)\n");
+        fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
+                dr, cpi->total / cpi->count, total_psnr,
+                cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
                 total_encode_time);
-//                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-//                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
-//                        total_encode_time, cpi->tot_recode_hits);
+//         fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
+//                 dr, cpi->total / cpi->count, total_psnr,
+//                 cpi->totalp / cpi->count, total_psnr2, total_ssim,
+//                 total_encode_time, cpi->tot_recode_hits);
       }
 
       if (cpi->b_calculate_ssimg) {
@@ -1738,88 +1600,6 @@
 
 #endif
 
-
-#ifdef MODE_STATS
-    {
-      extern int count_mb_seg[4];
-      char modes_stats_file[250];
-      FILE *f;
-      double dr = (double)cpi->oxcf.frame_rate * (double)cpi->bytes * (double)8 / (double)cpi->count / (double)1000;
-      sprintf(modes_stats_file, "modes_q%03d.stt", cpi->common.base_qindex);
-      f = fopen(modes_stats_file, "w");
-      fprintf(f, "intra_mode in Intra Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < VP9_YMODES; i++) fprintf(f, " %8d,", y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "I8: ");
-        for (i = 0; i < VP9_I8X8_MODES; i++) fprintf(f, " %8d,", i8x8_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i, j;
-        fprintf(f, "KeyFrame Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", uv_modes_y[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i, j;
-        fprintf(f, "Inter Y-UV:\n");
-        for (i = 0; i < VP9_YMODES; i++) {
-          fprintf(f, "%2d:", i);
-          for (j = 0; j < VP9_UV_MODES; j++) fprintf(f, "%8d, ", cpi->y_uv_mode_count[i][j]);
-          fprintf(f, "\n");
-        }
-      }
-      {
-        int i;
-
-        fprintf(f, "B: ");
-        for (i = 0; i < VP9_NKF_BINTRAMODES; i++)
-          fprintf(f, "%8d, ", b_modes[i]);
-
-        fprintf(f, "\n");
-
-      }
-
-      fprintf(f, "Modes in Inter Frames:\n");
-      {
-        int i;
-        fprintf(f, "Y: ");
-        for (i = 0; i < MB_MODE_COUNT; i++) fprintf(f, " %8d,", inter_y_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "UV: ");
-        for (i = 0; i < VP9_UV_MODES; i++) fprintf(f, " %8d,", inter_uv_modes[i]);
-        fprintf(f, "\n");
-      }
-      {
-        int i;
-        fprintf(f, "B: ");
-        for (i = 0; i < B_MODE_COUNT; i++) fprintf(f, "%8d, ", inter_b_modes[i]);
-        fprintf(f, "\n");
-      }
-      fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
-      fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
-      fclose(f);
-    }
-#endif
-
 #ifdef ENTROPY_STATS
     {
       int i, j, k;
@@ -1827,18 +1607,18 @@
 
       fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n");
       fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts ");
-      fprintf(fmode, "[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES]"
-                     "[VP9_KF_BINTRAMODES] =\n{\n");
+      fprintf(fmode, "[VP9_INTRA_MODES][VP9_INTRA_MODES]"
+                     "[VP9_INTRA_MODES] =\n{\n");
 
-      for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
+      for (i = 0; i < VP9_INTRA_MODES; i++) {
 
         fprintf(fmode, "    { // Above Mode :  %d\n", i);
 
-        for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
+        for (j = 0; j < VP9_INTRA_MODES; j++) {
 
           fprintf(fmode, "        {");
 
-          for (k = 0; k < VP9_KF_BINTRAMODES; k++) {
+          for (k = 0; k < VP9_INTRA_MODES; k++) {
             if (!intra_mode_stats[i][j][k])
               fprintf(fmode, " %5d, ", 1);
             else
@@ -1988,8 +1768,8 @@
   pkt.data.psnr.samples[0] = width * height;
   pkt.data.psnr.samples[1] = width * height;
 
-  width = (width + 1) / 2;
-  height = (height + 1) / 2;
+  width = orig->uv_width;
+  height = orig->uv_height;
 
   sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
                          recon->u_buffer, recon->uv_stride,
@@ -2098,10 +1878,7 @@
   return 0;
 }
 int vp9_update_entropy(VP9_PTR comp, int update) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-  VP9_COMMON *cm = &cpi->common;
-  cm->refresh_entropy_probs = update;
-
+  ((VP9_COMP *)comp)->common.refresh_frame_context = update;
   return 0;
 }
 
@@ -2146,7 +1923,7 @@
   } while (--h);
 
   src = s->u_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1,  yuv_rec_file);
@@ -2154,12 +1931,24 @@
   } while (--h);
 
   src = s->v_buffer;
-  h = (cm->height + 1) / 2;
+  h = s->uv_height;
 
   do {
     fwrite(src, s->uv_width, 1, yuv_rec_file);
     src += s->uv_stride;
   } while (--h);
+
+#if CONFIG_ALPHA
+  if (s->alpha_buffer) {
+    src = s->alpha_buffer;
+    h = s->alpha_height;
+    do {
+      fwrite(src, s->alpha_width, 1,  yuv_rec_file);
+      src += s->alpha_stride;
+    } while (--h);
+  }
+#endif
+
   fflush(yuv_rec_file);
 }
 #endif
@@ -2170,56 +1959,35 @@
   const int in_h = src_fb->y_crop_height;
   const int out_w = dst_fb->y_crop_width;
   const int out_h = dst_fb->y_crop_height;
-  int x, y;
+  int x, y, i;
 
+  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
+                      src_fb->alpha_buffer};
+  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
+                        src_fb->alpha_stride};
+
+  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
+                      dst_fb->alpha_buffer};
+  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
+                        dst_fb->alpha_stride};
+
   for (y = 0; y < out_h; y += 16) {
     for (x = 0; x < out_w; x += 16) {
-      int x_q4 = x * 16 * in_w / out_w;
-      int y_q4 = y * 16 * in_h / out_h;
-      uint8_t *src, *dst;
-      int src_stride, dst_stride;
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const int factor = i == 0 ? 1 : 2;
+        const int x_q4 = x * (16 / factor) * in_w / out_w;
+        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
+                                 x / factor * in_w / out_w;
+        uint8_t *dst = dsts[i] + y * dst_stride + x;
 
-
-      src = src_fb->y_buffer +
-          y * in_h / out_h * src_fb->y_stride +
-          x * in_w / out_w;
-      dst = dst_fb->y_buffer +
-          y * dst_fb->y_stride +
-          x;
-      src_stride = src_fb->y_stride;
-      dst_stride = dst_fb->y_stride;
-
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    16, 16);
-
-      x_q4 >>= 1;
-      y_q4 >>= 1;
-      src_stride = src_fb->uv_stride;
-      dst_stride = dst_fb->uv_stride;
-
-      src = src_fb->u_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->u_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
-
-      src = src_fb->v_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->v_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
+        vp9_convolve8(src, src_stride, dst, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                      16 / factor, 16 / factor);
+      }
     }
   }
 
@@ -2228,62 +1996,35 @@
 
 
 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
-  // Update data structure that monitors level of reference to last GF
-  vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-  cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
   // this frame refreshes means next frames don't unless specified by user
   cpi->common.frames_since_golden = 0;
 
-  // Clear the alternate reference update pending flag.
-  cpi->source_alt_ref_pending = FALSE;
+#if CONFIG_MULTIPLE_ARF
+  if (!cpi->multi_arf_enabled)
+#endif
+    // Clear the alternate reference update pending flag.
+    cpi->source_alt_ref_pending = 0;
 
-  // Set the alternate refernce frame active flag
-  cpi->source_alt_ref_active = TRUE;
-
-
+  // Set the alternate reference frame active flag
+  cpi->source_alt_ref_active = 1;
 }
 static void update_golden_frame_stats(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-
   // Update the Golden frame usage counts.
   if (cpi->refresh_golden_frame) {
-    // Update data structure that monitors level of reference to last GF
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-
     // this frame refreshes means next frames don't unless specified by user
     cpi->refresh_golden_frame = 0;
     cpi->common.frames_since_golden = 0;
 
-    // if ( cm->frame_type == KEY_FRAME )
-    // {
-    cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
-    cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
-    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
-    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-    // }
-    // else
-    // {
-    //  // Carry a potrtion of count over to begining of next gf sequence
-    //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
-    //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
-    // }
-
     // ******** Fixed Q test code only ************
     // If we are going to use the ALT reference for the next group of frames set a flag to say so.
     if (cpi->oxcf.fixed_q >= 0 &&
         cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
-      cpi->source_alt_ref_pending = TRUE;
+      cpi->source_alt_ref_pending = 1;
       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
     }
 
     if (!cpi->source_alt_ref_pending)
-      cpi->source_alt_ref_active = FALSE;
+      cpi->source_alt_ref_active = 0;
 
     // Decrement count down till next gf
     if (cpi->frames_till_gf_update_due > 0)
@@ -2298,13 +2039,6 @@
       cpi->common.frames_till_alt_ref_frame--;
 
     cpi->common.frames_since_golden++;
-
-    if (cpi->common.frames_since_golden > 1) {
-      cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
-      cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
-      cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
-      cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
-    }
   }
 }
 
@@ -2384,7 +2118,8 @@
       int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
       h = (h < 0 ? -h : h);
       v = (v < 0 ? -v : v);
-      if (h > EDGE_THRESH || v > EDGE_THRESH) num_edge_pels++;
+      if (h > EDGE_THRESH || v > EDGE_THRESH)
+        num_edge_pels++;
       curr++;
       prev++;
       next++;
@@ -2393,7 +2128,7 @@
     prev += frame->y_stride - frame->y_width + 2;
     next += frame->y_stride - frame->y_width + 2;
   }
-  return (double)num_edge_pels / (double)num_pels;
+  return (double)num_edge_pels / num_pels;
 }
 
 // Function to test for conditions that indicate we should loop
@@ -2401,11 +2136,11 @@
 static int recode_loop_test(VP9_COMP *cpi,
                             int high_limit, int low_limit,
                             int q, int maxq, int minq) {
-  int force_recode = FALSE;
+  int force_recode = 0;
   VP9_COMMON *cm = &cpi->common;
 
   // Is frame recode allowed at all
-  // Yes if either recode mode 1 is selected or mode two is selcted
+  // Yes if either recode mode 1 is selected or mode two is selected
   // and the frame is a key frame. golden frame or alt_ref_frame
   if ((cpi->sf.recode_loop == 1) ||
       ((cpi->sf.recode_loop == 2) &&
@@ -2415,21 +2150,19 @@
     // General over and under shoot tests
     if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
         ((cpi->projected_frame_size < low_limit) && (q > minq))) {
-      force_recode = TRUE;
+      force_recode = 1;
     }
     // Special Constrained quality tests
     else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
       // Undershoot and below auto cq level
-      if ((q > cpi->cq_target_quality) &&
-          (cpi->projected_frame_size <
-           ((cpi->this_frame_target * 7) >> 3))) {
-        force_recode = TRUE;
-      }
-      // Severe undershoot and between auto and user cq level
-      else if ((q > cpi->oxcf.cq_level) &&
-               (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
-               (cpi->active_best_quality > cpi->oxcf.cq_level)) {
-        force_recode = TRUE;
+      if (q > cpi->cq_target_quality &&
+          cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
+        force_recode = 1;
+      } else if (q > cpi->oxcf.cq_level &&
+                 cpi->projected_frame_size < cpi->min_frame_bandwidth &&
+                 cpi->active_best_quality > cpi->oxcf.cq_level) {
+        // Severe undershoot and between auto and user cq level
+        force_recode = 1;
         cpi->active_best_quality = cpi->oxcf.cq_level;
       }
     }
@@ -2448,13 +2181,19 @@
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
     ref_cnt_fb(cm->fb_idx_ref_cnt,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
-  } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+  }
+#if CONFIG_MULTIPLE_ARF
+  else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame &&
+      !cpi->refresh_alt_ref_frame) {
+#else
+  else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+#endif
     /* Preserve the previously existing golden frame and update the frame in
      * the alt ref slot instead. This is highly specific to the current use of
      * alt-ref as a forward reference, and this needs to be generalized as
      * other uses are implemented (like RTC/temporal scaling)
      *
-     * The update to the buffer in the alt ref slot was signalled in
+     * The update to the buffer in the alt ref slot was signaled in
      * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated
      * as the golden frame next time.
      */
@@ -2466,10 +2205,16 @@
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
-  } else { /* For non key/golden frames */
+  }  else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
+      int arf_idx = cpi->alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+      if (cpi->multi_arf_enabled) {
+        arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1];
+      }
+#endif
       ref_cnt_fb(cm->fb_idx_ref_cnt,
-                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+                 &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
     }
 
     if (cpi->refresh_golden_frame) {
@@ -2485,7 +2230,7 @@
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  if (cm->no_lpf || cpi->mb.e_mbd.lossless) {
+  if (cpi->mb.e_mbd.lossless) {
     cm->filter_level = 0;
   } else {
     struct vpx_usec_timer timer;
@@ -2493,11 +2238,9 @@
     vp9_clear_system_state();
 
     vpx_usec_timer_start(&timer);
-    if (cpi->sf.auto_filter == 0)
-      vp9_pick_filter_level_fast(cpi->Source, cpi);
-    else
-      vp9_pick_filter_level(cpi->Source, cpi);
 
+    vp9_pick_filter_level(cpi->Source, cpi);
+
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
   }
@@ -2504,11 +2247,11 @@
 
   if (cm->filter_level > 0) {
     vp9_set_alt_lf_level(cpi, cm->filter_level);
-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0,
-                          cm->dering_enabled);
+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0);
   }
 
-  vp8_yv12_extend_frame_borders(cm->frame_to_show);
+  vp9_extend_frame_borders(cm->frame_to_show,
+                           cm->subsampling_x, cm->subsampling_y);
 
 }
 
@@ -2551,20 +2294,6 @@
   }
 }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-static void select_interintra_mode(VP9_COMP *cpi) {
-  static const double threshold = 0.01;
-  VP9_COMMON *cm = &cpi->common;
-  // FIXME(debargha): Make this RD based
-  int sum = cpi->interintra_select_count[1] + cpi->interintra_select_count[0];
-  if (sum) {
-    double fraction = (double) cpi->interintra_select_count[1] / sum;
-    // printf("fraction: %f\n", fraction);
-    cm->use_interintra = (fraction > threshold);
-  }
-}
-#endif
-
 static void scale_references(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int i;
@@ -2576,9 +2305,10 @@
         ref->y_crop_height != cm->height) {
       int new_fb = get_free_fb(cm);
 
-      vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],
-                                    cm->width, cm->height,
-                                    VP9BORDERINPIXELS);
+      vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+                               VP9BORDERINPIXELS);
       scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
       cpi->scaled_ref_idx[i] = new_fb;
     } else {
@@ -2592,9 +2322,8 @@
   VP9_COMMON *cm = &cpi->common;
   int i;
 
-  for (i = 0; i < 3; i++) {
+  for (i = 0; i < 3; i++)
     cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;
-  }
 }
 
 static void encode_frame_to_data_rate(VP9_COMP *cpi,
@@ -2603,12 +2332,12 @@
                                       unsigned int *frame_flags) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
-  int Q;
+  TX_SIZE t;
+  int q;
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
 
-  int Loop = FALSE;
+  int loop = 0;
   int loop_count;
 
   int q_low;
@@ -2616,10 +2345,10 @@
 
   int top_index;
   int bottom_index;
-  int active_worst_qchanged = FALSE;
+  int active_worst_qchanged = 0;
 
-  int overshoot_seen = FALSE;
-  int undershoot_seen = FALSE;
+  int overshoot_seen = 0;
+  int undershoot_seen = 0;
 
   SPEED_FEATURES *sf = &cpi->sf;
 #if RESET_FOREACH_FILTER
@@ -2634,11 +2363,7 @@
 
   /* list of filters to search over */
   int mcomp_filters_to_search[] = {
-#if CONFIG_ENABLE_6TAP
-      EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
-#else
-      EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
-#endif
+    EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
   };
   int mcomp_filters = sizeof(mcomp_filters_to_search) /
       sizeof(*mcomp_filters_to_search);
@@ -2646,8 +2371,8 @@
   int64_t mcomp_filter_cost[4];
 
   /* Scale the source buffer, if required */
-  if (cm->mb_cols * 16 != cpi->un_scaled_source->y_width ||
-      cm->mb_rows * 16 != cpi->un_scaled_source->y_height) {
+  if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
+      cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
     scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
     cpi->Source = &cpi->scaled_source;
   } else {
@@ -2663,7 +2388,8 @@
   // For an alt ref frame in 2 pass we skip the call to the second
   // pass function that sets the target bandwidth so must set it here
   if (cpi->refresh_alt_ref_frame) {
-    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                           // Per frame bit target for the alt ref frame
+    // Per frame bit target for the alt ref frame
+    cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
     // per second target bitrate
     cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
                                   cpi->output_frame_rate);
@@ -2678,17 +2404,14 @@
   cpi->zbin_mode_boost = 0;
 
   // if (cpi->oxcf.lossless)
-    cpi->zbin_mode_boost_enabled = FALSE;
+    cpi->zbin_mode_boost_enabled = 0;
   // else
-  //   cpi->zbin_mode_boost_enabled = TRUE;
+  //   cpi->zbin_mode_boost_enabled = 1;
 
   // Current default encoder behaviour for the altref sign bias
-  if (cpi->source_alt_ref_active)
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
-  else
-    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
 
-  // Check to see if a key frame is signalled
+  // Check to see if a key frame is signaled
   // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
   if ((cm->current_video_frame == 0) ||
       (cm->frame_flags & FRAMEFLAGS_KEY) ||
@@ -2715,12 +2438,11 @@
     }
 
     // The alternate reference frame cannot be active for a key frame
-    cpi->source_alt_ref_active = FALSE;
+    cpi->source_alt_ref_active = 0;
 
     // Reset the RD threshold multipliers to default of * 1 (128)
-    for (i = 0; i < MAX_MODES; i++) {
+    for (i = 0; i < MAX_MODES; i++)
       cpi->rd_thresh_mult[i] = 128;
-    }
 
     cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
     cm->frame_parallel_decoding_mode =
@@ -2727,13 +2449,15 @@
       (cpi->oxcf.frame_parallel_decoding_mode != 0);
     if (cm->error_resilient_mode) {
       cm->frame_parallel_decoding_mode = 1;
-      cm->refresh_entropy_probs = 0;
+      cm->reset_frame_context = 0;
+      cm->refresh_frame_context = 0;
     }
   }
 
-  // Configure use of segmentation for enhanced coding of static regions.
+  // Configure experimental use of segmentation for enhanced coding of
+  // static regions if indicated.
   // Only allowed for now in second pass of two pass (as requires lagged coding)
-  // and if the relevent speed feature flag is set.
+  // and if the relevant speed feature flag is set.
   if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
     configure_static_seg_features(cpi);
   }
@@ -2744,31 +2468,10 @@
   vp9_clear_system_state();
 
   // Set an active best quality and if necessary active worst quality
-  Q = cpi->active_worst_quality;
+  q = cpi->active_worst_quality;
 
   if (cm->frame_type == KEY_FRAME) {
-    int high = 2000;
-    int low = 400;
-
-    if (cpi->kf_boost > high)
-      cpi->active_best_quality = kf_low_motion_minq[Q];
-    else if (cpi->kf_boost < low)
-      cpi->active_best_quality = kf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->kf_boost;
-      int qdiff = kf_high_motion_minq[Q] - kf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
-
-      cpi->active_best_quality = kf_low_motion_minq[Q] + adjustment;
-    }
-
-    // Make an adjustment based on the %s static
-    // The main impact of this is at lower Q to prevent overly large key
-    // frames unless a lot of the image is static.
-    if (cpi->kf_zeromotion_pct < 64)
-      cpi->active_best_quality += 4 - (cpi->kf_zeromotion_pct >> 4);
-
+#if !CONFIG_MULTIPLE_ARF
     // Special case for key frames forced because we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping
@@ -2775,14 +2478,54 @@
     if (cpi->this_key_frame_forced) {
       int delta_qindex;
       int qindex = cpi->last_boosted_qindex;
+      double last_boosted_q = vp9_convert_qindex_to_q(qindex);
 
-      delta_qindex = compute_qdelta(cpi, qindex,
-                                    (qindex * 0.75));
+      delta_qindex = compute_qdelta(cpi, last_boosted_q,
+                                    (last_boosted_q * 0.75));
 
-      cpi->active_best_quality = qindex + delta_qindex;
-      if (cpi->active_best_quality < cpi->best_quality)
-        cpi->active_best_quality = cpi->best_quality;
+      cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality);
+    } else {
+      int high = 5000;
+      int low = 400;
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      // Baseline value derived from cpi->active_worst_quality and kf boost
+      if (cpi->kf_boost > high) {
+        cpi->active_best_quality = kf_low_motion_minq[q];
+      } else if (cpi->kf_boost < low) {
+        cpi->active_best_quality = kf_high_motion_minq[q];
+      } else {
+        const int gap = high - low;
+        const int offset = high - cpi->kf_boost;
+        const int qdiff = kf_high_motion_minq[q] - kf_low_motion_minq[q];
+        const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+
+        cpi->active_best_quality = kf_low_motion_minq[q] + adjustment;
+      }
+
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((cm->width * cm->height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Make a further adjustment based on the kf zero motion measure.
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
+
+      // Convert the adjustment factor to a qindex delta on active_best_quality.
+      q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
+      cpi->active_best_quality +=
+        compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
     }
+#else
+    double current_q;
+
+    // Force the KF quantizer to be 30% of the active_worst_quality.
+    current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
+    cpi->active_best_quality = cpi->active_worst_quality
+        + compute_qdelta(cpi, current_q, current_q * 0.3);
+#endif
   } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
     int high = 2000;
     int low = 400;
@@ -2790,47 +2533,45 @@
     // Use the lower of cpi->active_worst_quality and recent
     // average Q as basis for GF/ARF Q limit unless last frame was
     // a key frame.
-    if ((cpi->frames_since_key > 1) &&
-        (cpi->avg_frame_qindex < cpi->active_worst_quality)) {
-      Q = cpi->avg_frame_qindex;
+    if (cpi->frames_since_key > 1 &&
+        cpi->avg_frame_qindex < cpi->active_worst_quality) {
+      q = cpi->avg_frame_qindex;
     }
 
     // For constrained quality dont allow Q less than the cq level
-    if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-        (Q < cpi->cq_target_quality)) {
-      Q = cpi->cq_target_quality;
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+        q < cpi->cq_target_quality) {
+      q = cpi->cq_target_quality;
     }
 
-    if (cpi->gfu_boost > high)
-      cpi->active_best_quality = gf_low_motion_minq[Q];
-    else if (cpi->gfu_boost < low)
-      cpi->active_best_quality = gf_high_motion_minq[Q];
-    else {
-      int gap = high - low;
-      int offset = high - cpi->gfu_boost;
-      int qdiff = gf_high_motion_minq[Q] - gf_low_motion_minq[Q];
-      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+    if (cpi->gfu_boost > high) {
+      cpi->active_best_quality = gf_low_motion_minq[q];
+    } else if (cpi->gfu_boost < low) {
+      cpi->active_best_quality = gf_high_motion_minq[q];
+    } else {
+      const int gap = high - low;
+      const int offset = high - cpi->gfu_boost;
+      const int qdiff = gf_high_motion_minq[q] - gf_low_motion_minq[q];
+      const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
 
-      cpi->active_best_quality = gf_low_motion_minq[Q] + adjustment;
+      cpi->active_best_quality = gf_low_motion_minq[q] + adjustment;
     }
 
     // Constrained quality use slightly lower active best.
-    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      cpi->active_best_quality =
-        cpi->active_best_quality * 15 / 16;
-    }
+    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+      cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
   } else {
 #ifdef ONE_SHOT_Q_ESTIMATE
 #ifdef STRICT_ONE_SHOT_Q
-    cpi->active_best_quality = Q;
+    cpi->active_best_quality = q;
 #else
-    cpi->active_best_quality = inter_minq[Q];
+    cpi->active_best_quality = inter_minq[q];
 #endif
 #else
-    cpi->active_best_quality = inter_minq[Q];
+    cpi->active_best_quality = inter_minq[q];
 #endif
 
-    // For the constant/constrained quality mode we dont want
+    // For the constant/constrained quality mode we don't want
     // q to fall below the cq level.
     if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
         (cpi->active_best_quality < cpi->cq_target_quality)) {
@@ -2859,22 +2600,45 @@
 
   // Special case code to try and match quality with forced key frames
   if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-    Q = cpi->last_boosted_qindex;
+    q = cpi->last_boosted_qindex;
   } else {
     // Determine initial Q to try
-    Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+    q = vp9_regulate_q(cpi, cpi->this_frame_target);
   }
 
   vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
                                 &frame_over_shoot_limit);
 
-  // Limit Q range for the adaptive loop.
-  bottom_index = cpi->active_best_quality;
-  top_index    = cpi->active_worst_quality;
-  q_low  = cpi->active_best_quality;
-  q_high = cpi->active_worst_quality;
+#if CONFIG_MULTIPLE_ARF
+  // Force the quantizer determined by the coding order pattern.
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME)) {
+    double new_q;
+    double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
+    int level = cpi->this_frame_weight;
+    assert(level >= 0);
 
+    // Set quantizer steps at 10% increments.
+    new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
+    q = cpi->active_worst_quality + compute_qdelta(cpi, current_q, new_q);
+
+    bottom_index = q;
+    top_index    = q;
+    q_low  = q;
+    q_high = q;
+
+    printf("frame:%d q:%d\n", cm->current_video_frame, q);
+  } else {
+#endif
+    // Limit Q range for the adaptive loop.
+    bottom_index = cpi->active_best_quality;
+    top_index    = cpi->active_worst_quality;
+    q_low  = cpi->active_best_quality;
+    q_high = cpi->active_worst_quality;
+#if CONFIG_MULTIPLE_ARF
+  }
+#endif
   loop_count = 0;
+  vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes));
 
   if (cm->frame_type != KEY_FRAME) {
     /* TODO: Decide this more intelligently */
@@ -2885,16 +2649,10 @@
       cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
     }
     /* TODO: Decide this more intelligently */
-    xd->allow_high_precision_mv = (Q < HIGH_PRECISION_MV_QTHRESH);
+    xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
     set_mvcost(&cpi->mb);
   }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->current_video_frame == 0) {
-    cm->use_interintra = 1;
-  }
-#endif
-
 #if CONFIG_POSTPROC
 
   if (cpi->oxcf.noise_sensitivity > 0) {
@@ -2919,7 +2677,7 @@
         break;
     }
 
-    vp9_denoise(cpi->Source, cpi->Source, l, 1, 0);
+    vp9_denoise(cpi->Source, cpi->Source, l);
   }
 
 #endif
@@ -2942,66 +2700,23 @@
   do {
     vp9_clear_system_state();  // __asm emms;
 
-    vp9_set_quantizer(cpi, Q);
+    vp9_set_quantizer(cpi, q);
 
     if (loop_count == 0) {
 
-      // setup skip prob for costing in mode/mv decision
-      if (cpi->common.mb_no_coeff_skip) {
-        int k;
-        for (k = 0; k < MBSKIP_CONTEXTS; k++)
-          cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];
-
-        if (cm->frame_type != KEY_FRAME) {
-          if (cpi->refresh_alt_ref_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[2][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];
-            }
-          } else if (cpi->refresh_golden_frame) {
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[1][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];
-            }
-          } else {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; k++) {
-              if (cpi->last_skip_false_probs[0][k] != 0)
-                cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[0][k];
-            }
-          }
-
-          // as this is for cost estimate, let's make sure it does not
-          // get extreme either way
-          {
-            int k;
-            for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
-              if (cm->mbskip_pred_probs[k] < 5)
-                cm->mbskip_pred_probs[k] = 5;
-
-              if (cm->mbskip_pred_probs[k] > 250)
-                cm->mbskip_pred_probs[k] = 250;
-
-              if (cpi->is_src_frame_alt_ref)
-                cm->mbskip_pred_probs[k] = 1;
-            }
-          }
-        }
-      }
-
       // Set up entropy depending on frame type.
       if (cm->frame_type == KEY_FRAME) {
         /* Choose which entropy context to use. When using a forward reference
-	 * frame, it immediately follows the keyframe, and thus benefits from
-	 * using the same entropy context established by the keyframe. Otherwise,
-	 * use the default context 0.
-	 */
+         * frame, it immediately follows the keyframe, and thus benefits from
+         * using the same entropy context established by the keyframe.
+         *  Otherwise, use the default context 0.
+         */
         cm->frame_context_idx = cpi->oxcf.play_alternate;
         vp9_setup_key_frame(cpi);
       } else {
-	/* Choose which entropy context to use. Currently there are only two
-	 * contexts used, one for normal frames and one for alt ref frames.
-	 */
+        /* Choose which entropy context to use. Currently there are only two
+         * contexts used, one for normal frames and one for alt ref frames.
+         */
         cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
         vp9_setup_inter_frame(cpi);
       }
@@ -3008,16 +2723,12 @@
     }
 
     // transform / motion compensation build reconstruction frame
-#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS
-    if (cm->frame_type == KEY_FRAME)
-      vp9_adjust_default_coef_probs(cm);
-#endif
 
     vp9_encode_frame(cpi);
 
     // Update the skip mb flag probabilities based on the distribution
     // seen in the last encoder iteration.
-    update_base_skip_probs(cpi);
+    // update_base_skip_probs(cpi);
 
     vp9_clear_system_state();  // __asm emms;
 
@@ -3032,61 +2743,55 @@
 
     if (frame_over_shoot_limit == 0)
       frame_over_shoot_limit = 1;
-    active_worst_qchanged = FALSE;
+    active_worst_qchanged = 0;
 
     // Special case handling for forced key frames
     if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-      int last_q = Q;
+      int last_q = q;
       int kf_err = vp9_calc_ss_err(cpi->Source,
                                    &cm->yv12_fb[cm->new_fb_idx]);
 
       int high_err_target = cpi->ambient_err;
-      int low_err_target = (cpi->ambient_err >> 1);
+      int low_err_target = cpi->ambient_err >> 1;
 
       // Prevent possible divide by zero error below for perfect KF
-      kf_err += (!kf_err);
+      kf_err += !kf_err;
 
       // The key frame is not good enough or we can afford
       // to make it better without undue risk of popping.
-      if (((kf_err > high_err_target) &&
-           (cpi->projected_frame_size <= frame_over_shoot_limit)) ||
-          ((kf_err > low_err_target) &&
-           (cpi->projected_frame_size <= frame_under_shoot_limit))) {
+      if ((kf_err > high_err_target &&
+           cpi->projected_frame_size <= frame_over_shoot_limit) ||
+          (kf_err > low_err_target &&
+           cpi->projected_frame_size <= frame_under_shoot_limit)) {
         // Lower q_high
-        q_high = (Q > q_low) ? (Q - 1) : q_low;
+        q_high = q > q_low ? q - 1 : q_low;
 
         // Adjust Q
-        Q = (Q * high_err_target) / kf_err;
-        if (Q < ((q_high + q_low) >> 1))
-          Q = (q_high + q_low) >> 1;
-      }
-      // The key frame is much better than the previous frame
-      else if ((kf_err < low_err_target) &&
-               (cpi->projected_frame_size >= frame_under_shoot_limit)) {
+        q = (q * high_err_target) / kf_err;
+        q = MIN(q, (q_high + q_low) >> 1);
+      } else if (kf_err < low_err_target &&
+                cpi->projected_frame_size >= frame_under_shoot_limit) {
+        // The key frame is much better than the previous frame
         // Raise q_low
-        q_low = (Q < q_high) ? (Q + 1) : q_high;
+        q_low = q < q_high ? q + 1 : q_high;
 
         // Adjust Q
-        Q = (Q * low_err_target) / kf_err;
-        if (Q > ((q_high + q_low + 1) >> 1))
-          Q = (q_high + q_low + 1) >> 1;
+        q = (q * low_err_target) / kf_err;
+        q = MIN(q, (q_high + q_low + 1) >> 1);
       }
 
       // Clamp Q to upper and lower limits:
-      if (Q > q_high)
-        Q = q_high;
-      else if (Q < q_low)
-        Q = q_low;
+      q = clamp(q, q_low, q_high);
 
-      Loop = ((Q != last_q)) ? TRUE : FALSE;
+      loop = q != last_q;
     }
 
     // Is the projected frame size out of range and are we allowed to attempt to recode.
     else if (recode_loop_test(cpi,
                               frame_over_shoot_limit, frame_under_shoot_limit,
-                              Q, top_index, bottom_index)) {
-      int last_q = Q;
-      int Retries = 0;
+                              q, top_index, bottom_index)) {
+      int last_q = q;
+      int retries = 0;
 
       // Frame size out of permitted range:
       // Update correction factor & compute new Q to try...
@@ -3093,77 +2798,78 @@
 
       // Frame is too large
       if (cpi->projected_frame_size > cpi->this_frame_target) {
-        q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
+        // Raise Qlow as to at least the current value
+        q_low = q < q_high ? q + 1 : q_high;
 
-        if (undershoot_seen || (loop_count > 1)) {
-          // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+        if (undershoot_seen || loop_count > 1) {
+          // Update rate_correction_factor unless cpi->active_worst_quality
+          // has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 1);
 
-          Q = (q_high + q_low + 1) / 2;
+          q = (q_high + q_low + 1) / 2;
         } else {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 0);
 
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+          q = vp9_regulate_q(cpi, cpi->this_frame_target);
 
-          while ((Q < q_low) && (Retries < 10)) {
+          while (q < q_low && retries < 10) {
             vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
+            q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            retries++;
           }
         }
 
-        overshoot_seen = TRUE;
-      }
-      // Frame is too small
-      else {
-        q_high = (Q > q_low) ? (Q - 1) : q_low;
+        overshoot_seen = 1;
+      } else {
+        // Frame is too small
+        q_high = q > q_low ? q - 1 : q_low;
 
-        if (overshoot_seen || (loop_count > 1)) {
+        if (overshoot_seen || loop_count > 1) {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 1);
 
-          Q = (q_high + q_low) / 2;
+          q = (q_high + q_low) / 2;
         } else {
           // Update rate_correction_factor unless cpi->active_worst_quality has changed.
           if (!active_worst_qchanged)
             vp9_update_rate_correction_factors(cpi, 0);
 
-          Q = vp9_regulate_q(cpi, cpi->this_frame_target);
+          q = vp9_regulate_q(cpi, cpi->this_frame_target);
 
           // Special case reset for qlow for constrained quality.
           // This should only trigger where there is very substantial
           // undershoot on a frame and the auto cq level is above
           // the user passsed in value.
-          if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-              (Q < q_low)) {
-            q_low = Q;
+          if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
+            q_low = q;
           }
 
-          while ((Q > q_high) && (Retries < 10)) {
+          while (q > q_high && retries < 10) {
             vp9_update_rate_correction_factors(cpi, 0);
-            Q = vp9_regulate_q(cpi, cpi->this_frame_target);
-            Retries++;
+            q = vp9_regulate_q(cpi, cpi->this_frame_target);
+            retries++;
           }
         }
 
-        undershoot_seen = TRUE;
+        undershoot_seen = 1;
       }
 
       // Clamp Q to upper and lower limits:
-      Q = clamp(Q, q_low, q_high);
+      q = clamp(q, q_low, q_high);
 
-      Loop = Q != last_q;
-    } else
-      Loop = FALSE;
+      loop = q != last_q;
+    } else {
+      loop = 0;
+    }
 
     if (cpi->is_src_frame_alt_ref)
-      Loop = FALSE;
+      loop = 0;
 
-    if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
+    if (!loop && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
       if (mcomp_filter_index < mcomp_filters) {
         int64_t err = vp9_calc_ss_err(cpi->Source,
                                     &cm->yv12_fb[cm->new_fb_idx]);
@@ -3174,7 +2880,7 @@
         if (mcomp_filter_index < mcomp_filters) {
           cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index];
           loop_count = -1;
-          Loop = TRUE;
+          loop = 1;
         } else {
           int f;
           int64_t best_cost = mcomp_filter_cost[0];
@@ -3187,7 +2893,7 @@
           }
           if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) {
             loop_count = -1;
-            Loop = TRUE;
+            loop = 1;
             cm->mcomp_filter_type = mcomp_best_filter;
           }
           /*
@@ -3197,12 +2903,12 @@
           */
         }
 #if RESET_FOREACH_FILTER
-        if (Loop == TRUE) {
-          overshoot_seen = FALSE;
-          undershoot_seen = FALSE;
+        if (loop) {
+          overshoot_seen = 0;
+          undershoot_seen = 0;
           q_low = q_low0;
           q_high = q_high0;
-          Q = Q0;
+          q = Q0;
           cpi->rate_correction_factor = rate_correction_factor0;
           cpi->gf_rate_correction_factor = gf_rate_correction_factor0;
           cpi->active_best_quality = active_best_quality0;
@@ -3212,7 +2918,7 @@
       }
     }
 
-    if (Loop == TRUE) {
+    if (loop) {
       loop_count++;
 
 #if CONFIG_INTERNAL_STATS
@@ -3219,7 +2925,7 @@
       cpi->tot_recode_hits++;
 #endif
     }
-  } while (Loop == TRUE);
+  } while (loop);
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
@@ -3229,51 +2935,9 @@
                                        &cm->yv12_fb[cm->new_fb_idx]);
   }
 
-  // This frame's MVs are saved and will be used in next frame's MV
-  // prediction. Last frame has one more line(add to bottom) and one
-  // more column(add to right) than cm->mip. The edge elements are
-  // initialized to 0.
-  if (cm->show_frame) { // do not save for altref frame
-    int mb_row;
-    int mb_col;
-    MODE_INFO *tmp = cm->mip;
-
-    if (cm->frame_type != KEY_FRAME) {
-      for (mb_row = 0; mb_row < cm->mb_rows + 1; mb_row ++) {
-        for (mb_col = 0; mb_col < cm->mb_cols + 1; mb_col ++) {
-          if (tmp->mbmi.ref_frame != INTRA_FRAME)
-            cpi->lfmv[mb_col + mb_row * (cm->mode_info_stride + 1)].as_int = tmp->mbmi.mv[0].as_int;
-
-          cpi->lf_ref_frame_sign_bias[mb_col + mb_row * (cm->mode_info_stride + 1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
-          cpi->lf_ref_frame[mb_col + mb_row * (cm->mode_info_stride + 1)] = tmp->mbmi.ref_frame;
-          tmp++;
-        }
-      }
-    }
-  }
-
-  // Update the GF useage maps.
-  // This is done after completing the compression of a frame when all modes
-  // etc. are finalized but before loop filter
-  vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);
-
   if (cm->frame_type == KEY_FRAME)
     cpi->refresh_last_frame = 1;
 
-#if 0
-  {
-    FILE *f = fopen("gfactive.stt", "a");
-    fprintf(f, "%8d %8d %8d %8d %8d\n",
-            cm->current_video_frame,
-            (100 * cpi->gf_active_count)
-              / (cpi->common.mb_rows * cpi->common.mb_cols),
-            cpi->this_iiratio,
-            cpi->next_iiratio,
-            cpi->refresh_golden_frame);
-    fclose(f);
-  }
-#endif
-
   cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
 #if WRITE_RECON_BUFFER
@@ -3288,38 +2952,42 @@
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
+#if WRITE_RECON_BUFFER
+  if (cm->show_frame)
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 2000);
+  else
+    write_cx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 3000);
+#endif
+
   // build the bitstream
   cpi->dummy_packing = 0;
   vp9_pack_bitstream(cpi, dest, size);
 
-  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+  if (xd->update_mb_segmentation_map) {
     update_reference_segmentation_map(cpi);
   }
 
   release_scaled_references(cpi);
   update_reference_frames(cpi);
-  vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4);
-  vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
-  vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
-  vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
+
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    vp9_full_to_model_counts(cpi->common.fc.coef_counts[t],
+                             cpi->coef_counts[t]);
   if (!cpi->common.error_resilient_mode &&
       !cpi->common.frame_parallel_decoding_mode) {
     vp9_adapt_coef_probs(&cpi->common);
-#if CONFIG_CODE_NONZEROCOUNT
-    vp9_adapt_nzc_probs(&cpi->common);
-#endif
   }
+
   if (cpi->common.frame_type != KEY_FRAME) {
-    vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
-    vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
+    vp9_copy(cpi->common.fc.y_mode_counts, cpi->y_mode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
-    vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
-    vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
-#if CONFIG_COMP_INTERINTRA_PRED
-    vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
-#endif
+    vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
+    vp9_copy(cm->fc.intra_inter_count, cpi->intra_inter_count);
+    vp9_copy(cm->fc.comp_inter_count, cpi->comp_inter_count);
+    vp9_copy(cm->fc.single_ref_count, cpi->single_ref_count);
+    vp9_copy(cm->fc.comp_ref_count, cpi->comp_ref_count);
     cpi->common.fc.NMVcount = cpi->NMVcount;
     if (!cpi->common.error_resilient_mode &&
         !cpi->common.frame_parallel_decoding_mode) {
@@ -3328,9 +2996,9 @@
       vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
     }
   }
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (cm->frame_type != KEY_FRAME)
-    select_interintra_mode(cpi);
+
+#ifdef ENTROPY_STATS
+  vp9_update_mode_context_stats(cpi);
 #endif
 
   /* Move storing frame_type out of the above loop since it is also
@@ -3368,16 +3036,16 @@
     cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
 
   // Keep a record from which we can calculate the average Q excluding GF updates and key frames
-  if ((cm->frame_type != KEY_FRAME)
-      && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+  if (cm->frame_type != KEY_FRAME &&
+      !cpi->refresh_golden_frame &&
+      !cpi->refresh_alt_ref_frame) {
     cpi->ni_frames++;
-    cpi->tot_q += vp9_convert_qindex_to_q(Q);
+    cpi->tot_q += vp9_convert_qindex_to_q(q);
     cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
 
-    // Calculate the average Q for normal inter frames (not key or GFU
-    // frames).
-    cpi->ni_tot_qi += Q;
-    cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+    // Calculate the average Q for normal inter frames (not key or GFU frames).
+    cpi->ni_tot_qi += q;
+    cpi->ni_av_qi = cpi->ni_tot_qi / cpi->ni_frames;
   }
 
   // Update the buffer level variable.
@@ -3406,7 +3074,7 @@
   }
 
   // Actual bits spent
-  cpi->total_actual_bits    += cpi->projected_frame_size;
+  cpi->total_actual_bits += cpi->projected_frame_size;
 
   // Debug stats
   cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
@@ -3417,20 +3085,18 @@
   if (cm->frame_type == KEY_FRAME) {
     cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
 
-    if (cpi->twopass.kf_group_bits < 0)
-      cpi->twopass.kf_group_bits = 0;
+    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
   } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
     cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
 
-    if (cpi->twopass.gf_group_bits < 0)
-      cpi->twopass.gf_group_bits = 0;
+    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
   }
 
   // Update the skip mb flag probabilities based on the distribution seen
   // in this frame.
-  update_base_skip_probs(cpi);
+  // update_base_skip_probs(cpi);
 
-#if 0  // 1 && CONFIG_INTERNAL_STATS
+#if 0 && CONFIG_INTERNAL_STATS
   {
     FILE *f = fopen("tmp.stt", "a");
     int recon_err;
@@ -3440,7 +3106,7 @@
     recon_err = vp9_calc_ss_err(cpi->Source,
                                 &cm->yv12_fb[cm->new_fb_idx]);
 
-    if (cpi->twopass.total_left_stats->coded_error != 0.0)
+    if (cpi->twopass.total_left_stats.coded_error != 0.0)
       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"
               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
               "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
@@ -3463,9 +3129,9 @@
               cm->frame_type, cpi->gfu_boost,
               cpi->twopass.est_max_qcorrection_factor,
               (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
+              cpi->twopass.total_left_stats.coded_error,
               (double)cpi->twopass.bits_left /
-              cpi->twopass.total_left_stats->coded_error,
+              cpi->twopass.total_left_stats.coded_error,
               cpi->tot_recode_hits, recon_err, cpi->kf_boost,
               cpi->kf_zeromotion_pct);
     else
@@ -3492,7 +3158,7 @@
               cm->frame_type, cpi->gfu_boost,
               cpi->twopass.est_max_qcorrection_factor,
               (int)cpi->twopass.bits_left,
-              cpi->twopass.total_left_stats->coded_error,
+              cpi->twopass.total_left_stats.coded_error,
               cpi->tot_recode_hits, recon_err, cpi->kf_boost,
               cpi->kf_zeromotion_pct);
 
@@ -3577,10 +3243,33 @@
     // Tell the caller that the frame was coded as a key frame
     *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
 
-    // As this frame is a key frame  the next defaults to an inter frame.
+#if CONFIG_MULTIPLE_ARF
+    // Reset the sequence number.
+    if (cpi->multi_arf_enabled) {
+      cpi->sequence_number = 0;
+      cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
+      cpi->new_frame_coding_order_period = -1;
+    }
+#endif
+
+    // As this frame is a key frame the next defaults to an inter frame.
     cm->frame_type = INTER_FRAME;
   } else {
     *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+#if CONFIG_MULTIPLE_ARF
+    /* Increment position in the coded frame sequence. */
+    if (cpi->multi_arf_enabled) {
+      ++cpi->sequence_number;
+      if (cpi->sequence_number >= cpi->frame_coding_order_period) {
+        cpi->sequence_number = 0;
+        cpi->frame_coding_order_period = cpi->new_frame_coding_order_period;
+        cpi->new_frame_coding_order_period = -1;
+      }
+      cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number];
+      assert(cpi->this_frame_weight >= 0);
+    }
+#endif
   }
 
   // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
@@ -3592,16 +3281,16 @@
   cm->last_width = cm->width;
   cm->last_height = cm->height;
 
-  // Dont increment frame counters if this was an altref buffer update not a real frame
+  // Don't increment frame counters if this was an altref buffer
+  // update not a real frame
+  cm->last_show_frame = cm->show_frame;
   if (cm->show_frame) {
-    cm->current_video_frame++;
-    cpi->frames_since_key++;
+    ++cm->current_video_frame;
+    ++cpi->frames_since_key;
   }
 
   // reset to normal state now that we are done.
 
-
-
 #if 0
   {
     char filename[512];
@@ -3620,11 +3309,15 @@
 
   if (cm->show_frame) {
     vpx_memcpy(cm->prev_mip, cm->mip,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+               cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
+               sizeof(MODE_INFO));
   } else {
     vpx_memset(cm->prev_mip, 0,
-               (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));
+               cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) *
+               sizeof(MODE_INFO));
   }
+  // restore prev_mi
+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
 }
 
 static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
@@ -3662,6 +3355,15 @@
   struct vpx_usec_timer  timer;
   int                    res = 0;
 
+  if (!cpi->initial_width) {
+    // TODO(jkoleszar): Support 1/4 subsampling?
+    cm->subsampling_x = sd->uv_width < sd->y_width;
+    cm->subsampling_y = sd->uv_height < sd->y_height;
+    alloc_raw_frame_buffers(cpi);
+
+    cpi->initial_width = cm->width;
+    cpi->initial_height = cm->height;
+  }
   vpx_usec_timer_start(&timer);
   if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
                          cpi->active_map_enabled ? cpi->active_map : NULL))
@@ -3676,15 +3378,24 @@
 
 static int frame_is_reference(const VP9_COMP *cpi) {
   const VP9_COMMON *cm = &cpi->common;
-  const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const MACROBLOCKD *mb = &cpi->mb.e_mbd;
 
-  return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame
-         || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame
-         || cm->refresh_entropy_probs
-         || xd->mode_ref_lf_delta_update
-         || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+  return cm->frame_type == KEY_FRAME ||
+         cpi->refresh_last_frame ||
+         cpi->refresh_golden_frame ||
+         cpi->refresh_alt_ref_frame ||
+         cm->refresh_frame_context ||
+         mb->mode_ref_lf_delta_update ||
+         mb->update_mb_segmentation_map ||
+         mb->update_mb_segmentation_data;
 }
 
+#if CONFIG_MULTIPLE_ARF
+int is_next_frame_arf(VP9_COMP *cpi) {
+  // Negative entry in frame_coding_order indicates an ARF at this position.
+  return cpi->frame_coding_order[cpi->sequence_number + 1] < 0 ? 1 : 0;
+}
+#endif
 
 int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
                             unsigned long *size, unsigned char *dest,
@@ -3693,6 +3404,8 @@
   VP9_COMMON *cm = &cpi->common;
   struct vpx_usec_timer  cmptimer;
   YV12_BUFFER_CONFIG    *force_src_buffer = NULL;
+  int i;
+  // FILE *fp_out = fopen("enc_frame_type.txt", "a");
 
   if (!cpi)
     return -1;
@@ -3704,46 +3417,117 @@
   cpi->mb.e_mbd.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
   set_mvcost(&cpi->mb);
 
-  // Should we code an alternate reference frame
-  if (cpi->oxcf.play_alternate &&
-      cpi->source_alt_ref_pending) {
-    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead,
-                                          cpi->frames_till_gf_update_due))) {
+  // Should we code an alternate reference frame.
+  if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) {
+    int frames_to_arf;
+
+#if CONFIG_MULTIPLE_ARF
+    assert(!cpi->multi_arf_enabled ||
+           cpi->frame_coding_order[cpi->sequence_number] < 0);
+
+    if (cpi->multi_arf_enabled && (cpi->pass == 2))
+      frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number])
+        - cpi->next_frame_in_order;
+    else
+#endif
+      frames_to_arf = cpi->frames_till_gf_update_due;
+
+    assert(frames_to_arf < cpi->twopass.frames_to_key);
+
+    if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) {
+#if CONFIG_MULTIPLE_ARF
+      cpi->alt_ref_source[cpi->arf_buffered] = cpi->source;
+#else
       cpi->alt_ref_source = cpi->source;
+#endif
+
       if (cpi->oxcf.arnr_max_frames > 0) {
-        vp9_temporal_filter_prepare(cpi, cpi->frames_till_gf_update_due);
+        // Produce the filtered ARF frame.
+        // TODO(agrange) merge these two functions.
+        configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
+                              cpi->gfu_boost);
+        vp9_temporal_filter_prepare(cpi, frames_to_arf);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
-      cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+
+      cm->show_frame = 0;
+      cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
-      cm->show_frame = 0;
-      cpi->source_alt_ref_pending = FALSE;   // Clear Pending altf Ref flag.
       cpi->is_src_frame_alt_ref = 0;
+
+      // TODO(agrange) This needs to vary depending on where the next ARF is.
+      cm->frames_till_alt_ref_frame = frames_to_arf;
+
+#if CONFIG_MULTIPLE_ARF
+      if (!cpi->multi_arf_enabled)
+#endif
+        cpi->source_alt_ref_pending = 0;   // Clear Pending altf Ref flag.
     }
   }
 
   if (!cpi->source) {
+#if CONFIG_MULTIPLE_ARF
+    int i;
+#endif
     if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) {
       cm->show_frame = 1;
 
+#if CONFIG_MULTIPLE_ARF
+      // Is this frame the ARF overlay.
+      cpi->is_src_frame_alt_ref = 0;
+      for (i = 0; i < cpi->arf_buffered; ++i) {
+        if (cpi->source == cpi->alt_ref_source[i]) {
+          cpi->is_src_frame_alt_ref = 1;
+          cpi->refresh_golden_frame = 1;
+          break;
+        }
+      }
+#else
       cpi->is_src_frame_alt_ref = cpi->alt_ref_source
                                   && (cpi->source == cpi->alt_ref_source);
-
+#endif
       if (cpi->is_src_frame_alt_ref) {
-        cpi->refresh_last_frame = 0;
+        // Current frame is an ARF overlay frame.
+#if CONFIG_MULTIPLE_ARF
+        cpi->alt_ref_source[i] = NULL;
+#else
         cpi->alt_ref_source = NULL;
+#endif
+        // Don't refresh the last buffer for an ARF overlay frame. It will
+        // become the GF so preserve last as an alternative prediction option.
+        cpi->refresh_last_frame = 0;
       }
+#if CONFIG_MULTIPLE_ARF
+      ++cpi->next_frame_in_order;
+#endif
     }
   }
 
   if (cpi->source) {
-    cpi->un_scaled_source =
-      cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+    cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
+                                                           : &cpi->source->img;
     *time_stamp = cpi->source->ts_start;
     *time_end = cpi->source->ts_end;
     *frame_flags = cpi->source->flags;
+
+    // fprintf(fp_out, "   Frame:%d", cm->current_video_frame);
+#if CONFIG_MULTIPLE_ARF
+    if (cpi->multi_arf_enabled) {
+      // fprintf(fp_out, "   seq_no:%d  this_frame_weight:%d",
+      //         cpi->sequence_number, cpi->this_frame_weight);
+    } else {
+      // fprintf(fp_out, "\n");
+    }
+#else
+    // fprintf(fp_out, "\n");
+#endif
+
+#if CONFIG_MULTIPLE_ARF
+    if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2))
+      cpi->source_alt_ref_pending = is_next_frame_arf(cpi);
+#endif
   } else {
     *size = 0;
     if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) {
@@ -3751,6 +3535,7 @@
       cpi->twopass.first_pass_done = 1;
     }
 
+    // fclose(fp_out);
     return -1;
   }
 
@@ -3768,11 +3553,11 @@
       this_duration = cpi->source->ts_end - cpi->source->ts_start;
       step = 1;
     } else {
-      int64_t last_duration;
+      int64_t last_duration = cpi->last_end_time_stamp_seen
+                                - cpi->last_time_stamp_seen;
 
       this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
-      last_duration = cpi->last_end_time_stamp_seen
-                      - cpi->last_time_stamp_seen;
+
       // do a step update if the duration changes by 10%
       if (last_duration)
         step = (int)((this_duration - last_duration) * 10 / last_duration);
@@ -3779,21 +3564,15 @@
     }
 
     if (this_duration) {
-      if (step)
+      if (step) {
         vp9_new_frame_rate(cpi, 10000000.0 / this_duration);
-      else {
-        double avg_duration, interval;
-
-        /* Average this frame's rate into the last second's average
-         * frame rate. If we haven't seen 1 second yet, then average
-         * over the whole interval seen.
-         */
-        interval = (double)(cpi->source->ts_end
-                            - cpi->first_time_stamp_ever);
-        if (interval > 10000000.0)
-          interval = 10000000;
-
-        avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+      } else {
+        // Average this frame's rate into the last second's average
+        // frame rate. If we haven't seen 1 second yet, then average
+        // over the whole interval seen.
+        const double interval = MIN((double)(cpi->source->ts_end
+                                     - cpi->first_time_stamp_ever), 10000000.0);
+        double avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
         avg_duration *= (interval - avg_duration + this_duration);
         avg_duration /= interval;
 
@@ -3811,22 +3590,6 @@
   // Clear down mmx registers
   vp9_clear_system_state();  // __asm emms;
 
-  cm->frame_type = INTER_FRAME;
-  cm->frame_flags = *frame_flags;
-
-#if 0
-
-  if (cpi->refresh_alt_ref_frame) {
-    // cpi->refresh_golden_frame = 1;
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_last_frame = 0;
-  } else {
-    cpi->refresh_golden_frame = 0;
-    cpi->refresh_last_frame = 1;
-  }
-
-#endif
-
   /* find a free buffer for the new frame, releasing the reference previously
    * held.
    */
@@ -3833,17 +3596,50 @@
   cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
   cm->new_fb_idx = get_free_fb(cm);
 
+#if CONFIG_MULTIPLE_ARF
+  /* Set up the correct ARF frame. */
+  if (cpi->refresh_alt_ref_frame) {
+    ++cpi->arf_buffered;
+  }
+  if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+      (cpi->pass == 2)) {
+    cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number];
+  }
+#endif
+
   /* Get the mapping of L/G/A to the reference buffer pool */
   cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];
   cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];
   cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];
 
-  /* Reset the frame pointers to the current frame size */
-  vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
-                                cm->width, cm->height,
-                                VP9BORDERINPIXELS);
+#if 0  // CONFIG_MULTIPLE_ARF
+  if (cpi->multi_arf_enabled) {
+    fprintf(fp_out, "      idx(%d, %d, %d, %d) active(%d, %d, %d)",
+        cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx, cm->new_fb_idx,
+        cm->active_ref_idx[0], cm->active_ref_idx[1], cm->active_ref_idx[2]);
+    if (cpi->refresh_alt_ref_frame)
+      fprintf(fp_out, "  type:ARF");
+    if (cpi->is_src_frame_alt_ref)
+      fprintf(fp_out, "  type:OVERLAY[%d]", cpi->alt_fb_idx);
+    fprintf(fp_out, "\n");
+  }
+#endif
 
+  cm->frame_type = INTER_FRAME;
+  cm->frame_flags = *frame_flags;
+
+  // Reset the frame pointers to the current frame size
+  vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+                           cm->width, cm->height,
+                           cm->subsampling_x, cm->subsampling_y,
+                           VP9BORDERINPIXELS);
+
+  // Calculate scaling factors for each of the 3 available references
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+    vp9_setup_scale_factors(cm, i);
+
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
+
   if (cpi->pass == 1) {
     Pass1Encode(cpi, size, dest, frame_flags);
   } else if (cpi->pass == 2) {
@@ -3852,10 +3648,8 @@
     encode_frame_to_data_rate(cpi, size, dest, frame_flags);
   }
 
-  if (cm->refresh_entropy_probs) {
-    vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,
-               sizeof(cm->fc));
-  }
+  if (cm->refresh_frame_context)
+    cm->frame_contexts[cm->frame_context_idx] = cm->fc;
 
   if (*size > 0) {
     // if its a dropped frame honor the requests on subsequent frames
@@ -3862,20 +3656,19 @@
     cpi->droppable = !frame_is_reference(cpi);
 
     // return to normal state
-    cm->refresh_entropy_probs = 1;
+    cm->reset_frame_context = 0;
+    cm->refresh_frame_context = 1;
     cpi->refresh_alt_ref_frame = 0;
     cpi->refresh_golden_frame = 0;
     cpi->refresh_last_frame = 1;
     cm->frame_type = INTER_FRAME;
-
   }
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
-  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) {
+  if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
     generate_psnr_packet(cpi);
-  }
 
 #if CONFIG_INTERNAL_STATS
 
@@ -3923,7 +3716,7 @@
           double weight = 0;
 #if CONFIG_POSTPROC
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
-                      cm->filter_level * 10 / 6, 1, 0);
+                      cm->filter_level * 10 / 6);
 #endif
           vp9_clear_system_state();
 
@@ -3950,10 +3743,16 @@
           cpi->totalp  += frame_psnr2;
 
           frame_ssim2 = vp9_calc_ssim(cpi->Source,
-                                      &cm->post_proc_buffer, 1, &weight);
+                                      recon, 1, &weight);
 
           cpi->summed_quality += frame_ssim2 * weight;
           cpi->summed_weights += weight;
+
+          frame_ssim2 = vp9_calc_ssim(cpi->Source,
+                                      &cm->post_proc_buffer, 1, &weight);
+
+          cpi->summedp_quality += frame_ssim2 * weight;
+          cpi->summedp_weights += weight;
 #if 0
           {
             FILE *f = fopen("q_used.stt", "a");
@@ -3975,12 +3774,11 @@
         cpi->total_ssimg_v += v;
         cpi->total_ssimg_all += frame_all;
       }
-
     }
   }
 
 #endif
-
+  // fclose(fp_out);
   return 0;
 }
 
@@ -4013,8 +3811,9 @@
 }
 
 int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
-                   unsigned int cols, int delta_q[4], int delta_lf[4],
-                   unsigned int threshold[4]) {
+                   unsigned int cols, int delta_q[MAX_MB_SEGMENTS],
+                   int delta_lf[MAX_MB_SEGMENTS],
+                   unsigned int threshold[MAX_MB_SEGMENTS]) {
   VP9_COMP *cpi = (VP9_COMP *) comp;
   signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS];
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -4034,25 +3833,15 @@
   // Activate segmentation.
   vp9_enable_segmentation((VP9_PTR)cpi);
 
-  // Set up the quant segment data
-  feature_data[SEG_LVL_ALT_Q][0] = delta_q[0];
-  feature_data[SEG_LVL_ALT_Q][1] = delta_q[1];
-  feature_data[SEG_LVL_ALT_Q][2] = delta_q[2];
-  feature_data[SEG_LVL_ALT_Q][3] = delta_q[3];
+  // Set up the quan, LF and breakout threshold segment data
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
+    feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
+    feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
+    cpi->segment_encode_breakout[i] = threshold[i];
+  }
 
-  // Set up the loop segment data s
-  feature_data[SEG_LVL_ALT_LF][0] = delta_lf[0];
-  feature_data[SEG_LVL_ALT_LF][1] = delta_lf[1];
-  feature_data[SEG_LVL_ALT_LF][2] = delta_lf[2];
-  feature_data[SEG_LVL_ALT_LF][3] = delta_lf[3];
-
-  cpi->segment_encode_breakout[0] = threshold[0];
-  cpi->segment_encode_breakout[1] = threshold[1];
-  cpi->segment_encode_breakout[2] = threshold[2];
-  cpi->segment_encode_breakout[3] = threshold[3];
-
   // Enable the loop and quant changes in the feature mask
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < MAX_MB_SEGMENTS; i++) {
     if (delta_q[i])
       vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q);
     else
@@ -4079,8 +3868,9 @@
     if (map) {
       vpx_memcpy(cpi->active_map, map, rows * cols);
       cpi->active_map_enabled = 1;
-    } else
+    } else {
       cpi->active_map_enabled = 0;
+    }
 
     return 0;
   } else {
@@ -4095,12 +3885,9 @@
   VP9_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
-  if (horiz_mode > ONETWO)
+  if (horiz_mode > ONETWO || vert_mode > ONETWO)
     return -1;
 
-  if (vert_mode > ONETWO)
-    return -1;
-
   Scale2Ratio(horiz_mode, &hr, &hs);
   Scale2Ratio(vert_mode, &vr, &vs);
 
@@ -4141,6 +3928,5 @@
 
 
 int vp9_get_quantizer(VP9_PTR c) {
-  VP9_COMP   *cpi = (VP9_COMP *) c;
-  return cpi->common.base_qindex;
+  return ((VP9_COMP *)c)->common.base_qindex;
 }
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -30,24 +30,25 @@
 #include "vp9/encoder/vp9_lookahead.h"
 
 // Experimental rate control switches
-// #define ONE_SHOT_Q_ESTIMATE 1
-// #define STRICT_ONE_SHOT_Q 1
-// #define DISABLE_RC_LONG_TERM_MEM 1
+#if CONFIG_ONESHOTQ
+#define ONE_SHOT_Q_ESTIMATE 0
+#define STRICT_ONE_SHOT_Q 0
+#define DISABLE_RC_LONG_TERM_MEM 0
+#endif
 
 // #define SPEEDSTATS 1
+#if CONFIG_MULTIPLE_ARF
+// Set MIN_GF_INTERVAL to 1 for the full decomposition.
+#define MIN_GF_INTERVAL             2
+#else
 #define MIN_GF_INTERVAL             4
+#endif
 #define DEFAULT_GF_INTERVAL         7
 
 #define KEY_FRAME_CONTEXT 5
 
-#define MAX_LAG_BUFFERS 25
+#define MAX_MODES 36
 
-#if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54
-#else
-#define MAX_MODES 42
-#endif
-
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
 
@@ -63,63 +64,35 @@
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-#ifdef MODE_STATS
-  // Stats
-  int y_modes[VP9_YMODES];
-  int uv_modes[VP9_UV_MODES];
-  int i8x8_modes[VP9_I8X8_MODES];
-  int b_modes[B_MODE_COUNT];
-  int inter_y_modes[MB_MODE_COUNT];
-  int inter_uv_modes[VP9_UV_MODES];
-  int inter_b_modes[B_MODE_COUNT];
-#endif
-
   vp9_prob segment_pred_probs[PREDICTION_PROBS];
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-  vp9_prob ref_pred_probs[PREDICTION_PROBS];
-  vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
+  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob comp_ref_prob[REF_CONTEXTS];
 
   unsigned char *last_frame_seg_map_copy;
 
   // 0 = Intra, Last, GF, ARF
   signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
-  // 0 = BPRED, ZERO_MV, MV, SPLIT
+  // 0 = ZERO_MV, MV
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];
+  vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
-  vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-  vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-  vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-  vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
+  vp9_prob y_mode_prob[4][VP9_INTRA_MODES - 1];
+  vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
+  vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_prob interintra_prob;
-#endif
 
-  int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];
-  int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];
+  int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
 
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob nzc_probs_4x4
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
-  vp9_prob nzc_probs_8x8
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
-  vp9_prob nzc_probs_16x16
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
-  vp9_prob nzc_probs_32x32
-           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
-  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]
-                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];
-#endif
+  vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
+  vp9_prob mbskip_probs[MBSKIP_CONTEXTS];
 } CODING_CONTEXT;
 
 typedef struct {
@@ -142,8 +115,7 @@
   double new_mv_count;
   double duration;
   double count;
-}
-FIRSTPASS_STATS;
+} FIRSTPASS_STATS;
 
 typedef struct {
   int frames_so_far;
@@ -155,7 +127,6 @@
   double frame_mvr_abs;
   double frame_mvc;
   double frame_mvc_abs;
-
 } ONEPASS_FRAMESTATS;
 
 typedef struct {
@@ -207,12 +178,7 @@
   THR_SPLITA,
 
   THR_B_PRED,
-  THR_I8X8_PRED,
 
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
   THR_COMP_ZEROLA,
   THR_COMP_NEARESTLA,
   THR_COMP_NEARLA,
@@ -221,32 +187,13 @@
   THR_COMP_NEARESTGA,
   THR_COMP_NEARGA,
 
-  THR_COMP_NEWLG,
   THR_COMP_NEWLA,
   THR_COMP_NEWGA,
 
-  THR_COMP_SPLITLG,
   THR_COMP_SPLITLA,
   THR_COMP_SPLITGA,
-#if CONFIG_COMP_INTERINTRA_PRED
-  THR_COMP_INTERINTRA_ZEROL,
-  THR_COMP_INTERINTRA_NEARESTL,
-  THR_COMP_INTERINTRA_NEARL,
-  THR_COMP_INTERINTRA_NEWL,
+} THR_MODES;
 
-  THR_COMP_INTERINTRA_ZEROG,
-  THR_COMP_INTERINTRA_NEARESTG,
-  THR_COMP_INTERINTRA_NEARG,
-  THR_COMP_INTERINTRA_NEWG,
-
-  THR_COMP_INTERINTRA_ZEROA,
-  THR_COMP_INTERINTRA_NEARESTA,
-  THR_COMP_INTERINTRA_NEARA,
-  THR_COMP_INTERINTRA_NEWA,
-#endif
-}
-THR_MODES;
-
 typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
@@ -256,7 +203,6 @@
 typedef struct {
   int RD;
   SEARCH_METHODS search_method;
-  int improved_dct;
   int auto_filter;
   int recode_loop;
   int iterative_sub_pixel;
@@ -266,41 +212,25 @@
   int max_step_search_steps;
   int first_step;
   int optimize_coefficients;
-  int no_skip_block4x4_search;
   int search_best_filter;
-  int splitmode_breakout;
-  int mb16_breakout;
   int static_segmentation;
+  int comp_inter_joint_search_thresh;
+  int adpative_rd_thresh;
 } SPEED_FEATURES;
 
-typedef struct {
-  MACROBLOCK  mb;
-  int totalrate;
-} MB_ROW_COMP;
-
-typedef struct {
-  TOKENEXTRA *start;
-  TOKENEXTRA *stop;
-} TOKENLIST;
-
-typedef struct {
-  int ithread;
-  void *ptr1;
-  void *ptr2;
-} ENCODETHREAD_DATA;
-typedef struct {
-  int ithread;
-  void *ptr1;
-} LPFTHREAD_DATA;
-
 enum BlockSize {
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
+  BLOCK_4X4,
+  BLOCK_4X8,
+  BLOCK_8X4,
+  BLOCK_8X8,
+  BLOCK_8X16,
+  BLOCK_16X8,
   BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS,
-  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_32X32,
+  BLOCK_32X16,
+  BLOCK_16X32,
+  BLOCK_64X32,
+  BLOCK_32X64,
   BLOCK_64X64,
   BLOCK_MAX_SB_SEGMENTS,
 };
@@ -307,17 +237,25 @@
 
 typedef struct VP9_COMP {
 
-  DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]);
 
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+#if CONFIG_ALPHA
+  DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
+
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
+#endif
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
 
   MACROBLOCK mb;
@@ -326,7 +264,11 @@
 
   struct lookahead_ctx    *lookahead;
   struct lookahead_entry  *source;
+#if CONFIG_MULTIPLE_ARF
+  struct lookahead_entry  *alt_ref_source[NUM_REF_FRAMES];
+#else
   struct lookahead_entry  *alt_ref_source;
+#endif
 
   YV12_BUFFER_CONFIG *Source;
   YV12_BUFFER_CONFIG *un_scaled_source;
@@ -345,6 +287,9 @@
   int lst_fb_idx;
   int gld_fb_idx;
   int alt_fb_idx;
+#if CONFIG_MULTIPLE_ARF
+  int alt_ref_fb_idx[NUM_REF_FRAMES - 3];
+#endif
   int refresh_last_frame;
   int refresh_golden_frame;
   int refresh_alt_ref_frame;
@@ -358,6 +303,12 @@
   unsigned int key_frame_frequency;
   unsigned int this_key_frame_forced;
   unsigned int next_key_frame_forced;
+#if CONFIG_MULTIPLE_ARF
+  // Position within a frame coding order (including any additional ARF frames).
+  unsigned int sequence_number;
+  // Next frame in naturally occurring order that has not yet been coded.
+  int next_frame_in_order;
+#endif
 
   // Ambient reconstruction err target for force key frames
   int ambient_err;
@@ -367,16 +318,19 @@
   unsigned int mode_chosen_counts[MAX_MODES];
 
   int rd_thresh_mult[MAX_MODES];
-  int rd_baseline_thresh[MAX_MODES];
-  int rd_threshes[MAX_MODES];
+  int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES];
+  int rd_threshes[BLOCK_SIZE_TYPES][MAX_MODES];
+  int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES];
+
   int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
   int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
-  int comp_pred_count[COMP_PRED_CONTEXTS];
-  int single_pred_count[COMP_PRED_CONTEXTS];
+  unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
+  unsigned int single_ref_count[REF_CONTEXTS][2][2];
+  unsigned int comp_ref_count[REF_CONTEXTS][2];
+
   // FIXME contextualize
-  int txfm_count_32x32p[TX_SIZE_MAX_SB];
-  int txfm_count_16x16p[TX_SIZE_MAX_MB];
-  int txfm_count_8x8p[TX_SIZE_MAX_MB - 1];
+
   int64_t rd_tx_select_diff[NB_TXFM_MODES];
   int rd_tx_select_threshes[4][NB_TXFM_MODES];
 
@@ -396,7 +350,6 @@
   double gf_rate_correction_factor;
 
   int frames_till_gf_update_due;      // Count down till next GF
-  int current_gf_interval;          // GF interval chosen when we coded the last GF
 
   int gf_overspend_bits;            // Total bits overspent becasue of GF boost (cumulative)
 
@@ -453,57 +406,16 @@
 
   int cq_target_quality;
 
-  int sb32_count[2];
-  int sb64_count[2];
-  int sb_ymode_count [VP9_I32X32_MODES];
-  int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
-  int bmode_count[VP9_NKF_BINTRAMODES];
-  int i8x8_mode_count[VP9_I8X8_MODES];
-  int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
-  int mbsplit_count[VP9_NUMMBSPLITS];
-  int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
-#if CONFIG_COMP_INTERINTRA_PRED
-  unsigned int interintra_count[2];
-  unsigned int interintra_select_count[2];
-#endif
+  int y_mode_count[4][VP9_INTRA_MODES];
+  int y_uv_mode_count[VP9_INTRA_MODES][VP9_INTRA_MODES];
+  unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   nmv_context_counts NMVcount;
 
-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];
+  vp9_coeff_count coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_probs_model frame_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_stats frame_branch_ct[TX_SIZE_MAX_SB][BLOCK_TYPES];
 
-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];
-
-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];
-
-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];
-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];
-  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];
-
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_prob frame_nzc_probs_4x4
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];
-  unsigned int frame_nzc_branch_ct_4x4
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2];
-  vp9_prob frame_nzc_probs_8x8
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];
-  unsigned int frame_nzc_branch_ct_8x8
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2];
-  vp9_prob frame_nzc_probs_16x16
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];
-  unsigned int frame_nzc_branch_ct_16x16
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2];
-  vp9_prob frame_nzc_probs_32x32
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];
-  unsigned int frame_nzc_branch_ct_32x32
-      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2];
-#endif
-
   int gfu_boost;
   int last_boost;
   int kf_boost;
@@ -521,7 +433,6 @@
   int mbgraph_n_frames;             // number of frames filled in the above
   int static_mb_pct;                // % forced skip mbs by segmentation
   int seg0_progress, seg0_idx, seg0_cnt;
-  int ref_pred_count[3][2];
 
   int decimation_factor;
   int decimation_count;
@@ -529,7 +440,7 @@
   // for real time encoding
   int avg_encode_time;              // microsecond
   int avg_pick_mode_time;            // microsecond
-  int Speed;
+  int speed;
   unsigned int cpu_freq;           // Mhz
   int compressor_speed;
 
@@ -542,12 +453,8 @@
   vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];
   int last_skip_probs_q[3];
 
-  int recent_ref_frame_usage[MAX_REF_FRAMES];
-  int count_mb_ref_frame_usage[MAX_REF_FRAMES];
   int ref_frame_flags;
 
-  unsigned char ref_pred_probs_update[PREDICTION_PROBS];
-
   SPEED_FEATURES sf;
   int error_bins[1024];
 
@@ -555,8 +462,6 @@
   int inter_zz_count;
   int gf_bad_count;
   int gf_update_recommended;
-  int skip_true_count[3];
-  int skip_false_count[3];
 
   unsigned char *segmentation_map;
 
@@ -566,8 +471,6 @@
   unsigned char *active_map;
   unsigned int active_map_enabled;
 
-  TOKENLIST *tplist;
-
   fractional_mv_step_fp *find_fractional_mv_step;
   vp9_full_search_fn_t full_search_sad;
   vp9_refining_search_fn_t refining_search_sad;
@@ -578,16 +481,14 @@
   uint64_t time_pick_lpf;
   uint64_t time_encode_mb_row;
 
-  int base_skip_false_prob[QINDEX_RANGE][3];
-
   struct twopass_rc {
     unsigned int section_intra_rating;
     unsigned int next_iiratio;
     unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
     FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
+    FIRSTPASS_STATS total_left_stats;
     int first_pass_done;
     int64_t bits_left;
     int64_t clip_bits_total;
@@ -640,6 +541,8 @@
   int    bytes;
   double summed_quality;
   double summed_weights;
+  double summedp_quality;
+  double summedp_weights;
   unsigned int tot_recode_hits;
 
 
@@ -656,19 +559,8 @@
   unsigned int activity_avg;
   unsigned int *mb_activity_map;
   int *mb_norm_activity_map;
-
-  // Record of which MBs still refer to last golden frame either
-  // directly or through 0,0
-  unsigned char *gf_active_flags;
-  int gf_active_count;
-
   int output_partition;
 
-  // Store last frame's MV info for next frame MV prediction
-  int_mv *lfmv;
-  int *lf_ref_frame_sign_bias;
-  int *lf_ref_frame;
-
   /* force next frame to intra when kf_auto says so */
   int force_next_frame_intra;
 
@@ -680,13 +572,36 @@
                                       [VP9_SWITCHABLE_FILTERS];
   unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];
 
-#if CONFIG_NEW_MVREF
-  unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-#endif
-
   int initial_width;
   int initial_height;
+
+#if CONFIG_MULTIPLE_ARF
+  // ARF tracking variables.
+  int multi_arf_enabled;
+  unsigned int frame_coding_order_period;
+  unsigned int new_frame_coding_order_period;
+  int frame_coding_order[MAX_LAG_BUFFERS * 2];
+  int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2];
+  int arf_weight[MAX_LAG_BUFFERS];
+  int arf_buffered;
+  int this_frame_weight;
+  int max_arf_level;
+#endif
+
+#ifdef ENTROPY_STATS
+  int64_t mv_ref_stats[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+#endif
 } VP9_COMP;
+
+static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  if (ref_frame == LAST_FRAME) {
+    return cpi->lst_fb_idx;
+  } else if (ref_frame == GOLDEN_FRAME) {
+    return cpi->gld_fb_idx;
+  } else {
+    return cpi->alt_fb_idx;
+  }
+}
 
 void vp9_encode_frame(VP9_COMP *cpi);
 
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -120,112 +120,7 @@
   return max_filter_level;
 }
 
-void vp9_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
 
-  int best_err = 0;
-  int filt_err = 0;
-  int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
-  int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
-  int filt_val;
-  int best_filt_val = cm->filter_level;
-
-  //  Make a copy of the unfiltered / processed recon buffer
-  vp9_yv12_copy_partial_frame(cm->frame_to_show, &cpi->last_frame_uf, 3);
-
-  if (cm->frame_type == KEY_FRAME)
-    cm->sharpness_level = 0;
-  else
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-  if (cm->sharpness_level != cm->last_sharpness_level) {
-    vp9_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
-    cm->last_sharpness_level = cm->sharpness_level;
-  }
-
-  // Start the search at the previous frame filter level unless it is now out of range.
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-  else if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-
-  filt_val = cm->filter_level;
-  best_filt_val = filt_val;
-
-  // Get the err using the previous frame's filter value.
-  vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-  best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-  //  Re-instate the unfiltered frame
-  vp9_yv12_copy_partial_frame(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-  filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-
-  // Search lower filter levels
-  while (filt_val >= min_filter_level) {
-    // Apply the loop filter
-    vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-    // Get the err for filtered frame
-    filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-    //  Re-instate the unfiltered frame
-    vp9_yv12_copy_partial_frame(&cpi->last_frame_uf, cm->frame_to_show, 3);
-
-
-    // Update the best case record or exit loop.
-    if (filt_err < best_err) {
-      best_err = filt_err;
-      best_filt_val = filt_val;
-    } else
-      break;
-
-    // Adjust filter level
-    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
-  }
-
-  // Search up (note that we have already done filt_val = cm->filter_level)
-  filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
-
-  if (best_filt_val == cm->filter_level) {
-    // Resist raising filter level for very small gains
-    best_err -= (best_err >> 10);
-
-    while (filt_val < max_filter_level) {
-      // Apply the loop filter
-      vp9_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
-
-      // Get the err for filtered frame
-      filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
-
-      //  Re-instate the unfiltered frame
-      vp9_yv12_copy_partial_frame(&cpi->last_frame_uf,
-                                      cm->frame_to_show, 3);
-
-      // Update the best case record or exit loop.
-      if (filt_err < best_err) {
-        // Do not raise filter level if improvement is < 1 part in 4096
-        best_err = filt_err - (filt_err >> 10);
-
-        best_filt_val = filt_val;
-      } else
-        break;
-
-      // Adjust filter level
-      filt_val += (1 + ((filt_val > 10) ? 1 : 0));
-    }
-  }
-
-  cm->filter_level = best_filt_val;
-
-  if (cm->filter_level < min_filter_level)
-    cm->filter_level = min_filter_level;
-
-  if (cm->filter_level > max_filter_level)
-    cm->filter_level = max_filter_level;
-}
-
 // Stub function for now Alt LF not used
 void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) {
 }
@@ -268,7 +163,7 @@
 
   // Get baseline error score
   vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, 0);
+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1);
 
   best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
   filt_best = filt_mid;
@@ -293,7 +188,7 @@
     if ((filt_direction <= 0) && (filt_low != filt_mid)) {
       // Get Low filter error score
       vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, 0);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -313,7 +208,7 @@
     // Now look at filt_high
     if ((filt_direction >= 0) && (filt_high != filt_mid)) {
       vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, 0);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -338,30 +233,4 @@
   }
 
   cm->filter_level = filt_best;
-
-#if CONFIG_LOOP_DERING
-  /* Decide whether to turn on deringing filter */
-  {  // NOLINT
-    int best_dering = 0;
-    int this_dering;
-    int last_err_diff = INT_MAX;
-
-    for (this_dering = 1; this_dering <= 16; this_dering++) {
-      vp9_set_alt_lf_level(cpi, filt_best);
-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, this_dering);
-      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
-      vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
-      if (filt_err < best_err) {
-        best_err = filt_err;
-        best_dering = this_dering;
-        last_err_diff = INT_MAX;
-      } else {
-        if (filt_err - best_err > last_err_diff)
-          break;
-        last_err_diff = filt_err - best_err;
-      }
-    }
-    cm->dering_enabled = best_dering;
-  }
-#endif
 }
--- a/vp9/encoder/vp9_picklpf.h
+++ b/vp9/encoder/vp9_picklpf.h
@@ -15,9 +15,6 @@
 struct yv12_buffer_config;
 struct VP9_COMP;
 
-void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd,
-                                struct VP9_COMP *cpi);
-
 void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);
 
 void vp9_pick_filter_level(struct yv12_buffer_config *sd,
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,340 +21,12 @@
 extern int enc_debug;
 #endif
 
-static INLINE int plane_idx(MACROBLOCKD *xd, int b_idx) {
-  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-  if (b_idx < (16 << (sb_type * 2)))
-    return 0;  // Y
-  else if (b_idx < (20 << (sb_type * 2)))
-    return 16;  // U
-  assert(b_idx < (24 << (sb_type * 2)));
-  return 20;  // V
+static INLINE int plane_idx(int plane) {
+  return plane == 0 ? 0 :
+         plane == 1 ? 16 : 20;
 }
 
-void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  BLOCK *const b = &mb->block[0];
-  BLOCKD *const d = &xd->block[0];
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
-  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
-  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
-  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *zbin_ptr        = b->zbin;
-  int16_t *round_ptr       = b->round;
-  int16_t *quant_ptr       = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *dequant_ptr     = d->dequant;
-  int zbin_oq_value        = b->zbin_extra;
-  const int *pt_scan;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
-
-  assert(plane_idx(xd, b_idx) == 0);
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_4x4;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_4x4;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_4x4;
-      break;
-  }
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  if (!b->skip_block) {
-    for (i = 0; i < 16; i++) {
-      rc   = pt_scan[i];
-      z    = coeff_ptr[rc];
-
-      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-      zbin_boost_ptr++;
-
-      sz = (z >> 31);                                 // sign of z
-      x  = (z ^ sz) - sz;                             // x = abs(z)
-
-      if (x >= zbin) {
-        x += round_ptr[rc];
-        y  = (((x * quant_ptr[rc]) >> 16) + x)
-             >> quant_shift_ptr[rc];                // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-        if (y) {
-          eob = i;                                // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-        }
-      }
-    }
-  }
-
-  xd->eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  xd->nzcs[b_idx] = nzc;
-#endif
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
-  int i, rc, eob;
-  int zbin;
-  int x, y, z, sz;
-  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;
-  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;
-  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;
-  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
-  int16_t *zbin_ptr        = b->zbin;
-  int16_t *round_ptr       = b->round;
-  int16_t *quant_ptr       = b->quant;
-  uint8_t *quant_shift_ptr = b->quant_shift;
-  int16_t *dequant_ptr     = d->dequant;
-  int zbin_oq_value        = b->zbin_extra;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
-
-  vpx_memset(qcoeff_ptr, 0, 32);
-  vpx_memset(dqcoeff_ptr, 0, 32);
-
-  eob = -1;
-
-  if (!b->skip_block) {
-    for (i = 0; i < 16; i++) {
-      rc   = vp9_default_zig_zag1d_4x4[i];
-      z    = coeff_ptr[rc];
-
-      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-      zbin_boost_ptr++;
-
-      sz = (z >> 31);                                 // sign of z
-      x  = (z ^ sz) - sz;                             // x = abs(z)
-
-      if (x >= zbin) {
-        x += round_ptr[rc];
-
-        y  = (((x * quant_ptr[rc]) >> 16) + x)
-             >> quant_shift_ptr[rc];                // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc]  = x;                        // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
-
-        if (y) {
-          eob = i;                                // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
-        }
-      }
-    }
-  }
-
-  xd->eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  xd->nzcs[b_idx] = nzc;
-#endif
-}
-
-void vp9_quantize_mby_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 16; i++) {
-    TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, i);
-    if (tx_type != DCT_DCT) {
-      vp9_ht_quantize_b_4x4(x, i, tx_type);
-    } else {
-      x->quantize_b_4x4(x, i);
-    }
-  }
-}
-
-void vp9_quantize_mbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 16; i < 24; i++)
-    x->quantize_b_4x4(x, i);
-}
-
-void vp9_quantize_mb_4x4(MACROBLOCK *x) {
-  vp9_quantize_mby_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-}
-
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx;
-  int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
-  const int *pt_scan;
-
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_8x8;
-      break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_8x8;
-      break;
-    default:
-      pt_scan = vp9_default_zig_zag1d_8x8;
-      break;
-  }
-
-  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
-
-  if (!b->skip_block) {
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    int zero_run;
-    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
-    int16_t *coeff_ptr  = mb->coeff + 16 * b_idx;
-    int16_t *zbin_ptr   = b->zbin;
-    int16_t *round_ptr  = b->round;
-    int16_t *quant_ptr  = b->quant;
-    uint8_t *quant_shift_ptr = b->quant_shift;
-    int16_t *dequant_ptr = d->dequant;
-    int zbin_oq_value = b->zbin_extra;
-#if CONFIG_CODE_NONZEROCOUNT
-    int nzc = 0;
-#endif
-
-    eob = -1;
-
-    // Special case for DC as it is the one triggering access in various
-    // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0]
-    {
-      z    = coeff_ptr[0];
-      zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value);
-      zero_run = 1;
-
-      sz = (z >> 31);                                // sign of z
-      x  = (z ^ sz) - sz;                            // x = abs(z)
-
-      if (x >= zbin) {
-        x += (round_ptr[0]);
-        y  = ((int)(((int)(x * quant_ptr[0]) >> 16) + x))
-             >> quant_shift_ptr[0];                  // quantize (x)
-        x  = (y ^ sz) - sz;                          // get the sign back
-        qcoeff_ptr[0]  = x;                          // write to destination
-        dqcoeff_ptr[0] = x * dequant_ptr[0];         // dequantized value
-
-        if (y) {
-          eob = 0;                                   // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                  // number of nonzero coeffs
-#endif
-          zero_run = 0;
-        }
-      }
-    }
-    for (i = 1; i < 64; i++) {
-      rc   = pt_scan[i];
-      z    = coeff_ptr[rc];
-      zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value);
-      // The original code was incrementing zero_run while keeping it at
-      // maximum 15 by adding "(zero_run < 15)". The same is achieved by
-      // removing the opposite of the sign mask of "(zero_run - 15)".
-      zero_run -= (zero_run - 15) >> 31;
-
-      sz = (z >> 31);                                // sign of z
-      x  = (z ^ sz) - sz;                            // x = abs(z)
-
-      if (x >= zbin) {
-        x += (round_ptr[rc != 0]);
-        y  = ((int)(((int)(x * quant_ptr[1]) >> 16) + x))
-             >> quant_shift_ptr[1];                  // quantize (x)
-        x  = (y ^ sz) - sz;                          // get the sign back
-        qcoeff_ptr[rc]  = x;                         // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[1];        // dequantized value
-
-        if (y) {
-          eob = i;                                   // last nonzero coeffs
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                     // number of nonzero coeffs
-#endif
-          zero_run = 0;
-        }
-      }
-    }
-    xd->eobs[b_idx] = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-    xd->nzcs[b_idx] = nzc;
-#endif
-  } else {
-    xd->eobs[b_idx] = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-    xd->nzcs[b_idx] = 0;
-#endif
-  }
-}
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x) {
-  int i;
-
-#if CONFIG_CODE_NONZEROCOUNT
-  for (i = 0; i < 16; i ++) {
-    x->e_mbd.nzcs[i] = 0;
-  }
-#endif
-  for (i = 0; i < 16; i += 4) {
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, (i & 8) + ((i & 4) >> 1));
-    x->quantize_b_8x8(x, i, tx_type);
-  }
-}
-
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-#if CONFIG_CODE_NONZEROCOUNT
-  for (i = 16; i < 24; i ++) {
-    x->e_mbd.nzcs[i] = 0;
-  }
-#endif
-  for (i = 16; i < 24; i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT);
-}
-
-void vp9_quantize_mb_8x8(MACROBLOCK *x) {
-  vp9_quantize_mby_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-void vp9_quantize_mby_16x16(MACROBLOCK *x) {
-  TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd, 0);
-#if CONFIG_CODE_NONZEROCOUNT
-  int i;
-  for (i = 0; i < 16; i++) {
-    x->e_mbd.nzcs[i] = 0;
-  }
-#endif
-  x->quantize_b_16x16(x, 0, tx_type);
-}
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x) {
-  vp9_quantize_mby_16x16(x);
-  vp9_quantize_mbuv_8x8(x);
-}
-
-static void quantize(int16_t *zbin_boost_orig_ptr,
+static void quantize(int16_t *zbin_boost_orig_ptr,
                      int16_t *coeff_ptr, int n_coeffs, int skip_block,
                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                      uint8_t *quant_shift_ptr,
@@ -361,9 +33,6 @@
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                      int16_t *dequant_ptr, int zbin_oq_value,
                      uint16_t *eob_ptr,
-#if CONFIG_CODE_NONZEROCOUNT
-                     uint16_t *nzc_ptr,
-#endif
                      const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
@@ -370,9 +39,6 @@
   int x, y, z, sz;
   int zero_run = 0;
   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc = 0;
-#endif
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -401,9 +67,6 @@
         if (y) {
           eob = i;                                  // last nonzero coeffs
           zero_run = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-          ++nzc;                                    // number of nonzero coeffs
-#endif
         }
       }
     }
@@ -410,200 +73,66 @@
   }
 
   *eob_ptr = eob + 1;
-#if CONFIG_CODE_NONZEROCOUNT
-  *nzc_ptr = nzc;
-#endif
 }
 
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
+                  TX_TYPE tx_type) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
-  const int *pt_scan;
+  const int mul = n_coeffs == 1024 ? 2 : 1;
+  const int *scan;
 
-  switch (tx_type) {
-    case ADST_DCT:
-      pt_scan = vp9_row_scan_16x16;
+  // These contexts may be available in the caller
+  switch (n_coeffs) {
+    case 4 * 4:
+      scan = get_scan_4x4(tx_type);
       break;
-    case DCT_ADST:
-      pt_scan = vp9_col_scan_16x16;
+    case 8 * 8:
+      scan = get_scan_8x8(tx_type);
       break;
+    case 16 * 16:
+      scan = get_scan_16x16(tx_type);
+      break;
     default:
-      pt_scan = vp9_default_zig_zag1d_16x16;
+      scan = vp9_default_scan_32x32;
       break;
   }
 
-  quantize(b->zrun_zbin_boost,
-           mb->coeff + 16 * b_idx,
-           256, b->skip_block,
-           b->zbin, b->round, b->quant, b->quant_shift,
-           xd->qcoeff + 16 * b_idx,
-           xd->dqcoeff + 16 * b_idx,
-           d->dequant,
-           b->zbin_extra,
-           &xd->eobs[b_idx],
-#if CONFIG_CODE_NONZEROCOUNT
-           &xd->nzcs[b_idx],
-#endif
-           pt_scan, 1);
+  quantize(mb->plane[plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+           n_coeffs, mb->skip_block,
+           mb->plane[plane].zbin,
+           mb->plane[plane].round,
+           mb->plane[plane].quant,
+           mb->plane[plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+           xd->plane[plane].dequant,
+           mb->plane[plane].zbin_extra,
+           &xd->plane[plane].eobs[block],
+           scan, mul);
 }
 
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) {
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+                                int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
-  const int c_idx = plane_idx(xd, b_idx);
-  BLOCK *const b = &mb->block[c_idx];
-  BLOCKD *const d = &xd->block[c_idx];
+  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
+  const int *pt_scan = get_scan_4x4(tx_type);
 
-  quantize(b->zrun_zbin_boost,
-           mb->coeff + b_idx * 16,
-           1024, b->skip_block,
-           b->zbin,
-           b->round, b->quant, b->quant_shift,
-           xd->qcoeff + b_idx * 16,
-           xd->dqcoeff + b_idx * 16,
-           d->dequant,
-           b->zbin_extra,
-           &xd->eobs[b_idx],
-#if CONFIG_CODE_NONZEROCOUNT
-           &xd->nzcs[b_idx],
-#endif
-           vp9_default_zig_zag1d_32x32, 2);
+  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+           16, mb->skip_block,
+           mb->plane[pb_idx.plane].zbin,
+           mb->plane[pb_idx.plane].round,
+           mb->plane[pb_idx.plane].quant,
+           mb->plane[pb_idx.plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
+           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
+           xd->plane[pb_idx.plane].dequant,
+           mb->plane[pb_idx.plane].zbin_extra,
+           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
+           pt_scan, 1);
 }
 
-void vp9_quantize_sby_32x32(MACROBLOCK *x) {
-  vp9_regular_quantize_b_32x32(x, 0);
-}
-
-void vp9_quantize_sby_16x16(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 4; n++) {
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        (16 * (n & 2)) + ((n & 1) * 4));
-    x->quantize_b_16x16(x, n * 16, tx_type);
-  }
-}
-
-void vp9_quantize_sby_8x8(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      (4 * (n & 12)) + ((n & 3) * 2));
-    x->quantize_b_8x8(x, n * 4, tx_type);
-  }
-}
-
-void vp9_quantize_sby_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    if (tx_type != DCT_DCT) {
-      vp9_ht_quantize_b_4x4(x, n, tx_type);
-    } else {
-      x->quantize_b_4x4(x, n);
-    }
-  }
-}
-
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
-  x->quantize_b_16x16(x, 64, DCT_DCT);
-  x->quantize_b_16x16(x, 80, DCT_DCT);
-}
-
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 64; i < 96; i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT);
-}
-
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 64; i < 96; i++)
-    x->quantize_b_4x4(x, i);
-}
-
-void vp9_quantize_sb64y_32x32(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 4; n++)
-    vp9_regular_quantize_b_32x32(x, n * 64);
-}
-
-void vp9_quantize_sb64y_16x16(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 16; n++) {
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        (16 * (n & 12)) + ((n & 3) * 4));
-    x->quantize_b_16x16(x, n * 16, tx_type);
-  }
-}
-
-void vp9_quantize_sb64y_8x8(MACROBLOCK *x) {
-  int n;
-
-  for (n = 0; n < 64; n++) {
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      (4 * (n & 56)) + ((n & 7) * 2));
-    x->quantize_b_8x8(x, n * 4, tx_type);
-  }
-}
-
-void vp9_quantize_sb64y_4x4(MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < 256; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    if (tx_type != DCT_DCT) {
-      vp9_ht_quantize_b_4x4(x, n, tx_type);
-    } else {
-      x->quantize_b_4x4(x, n);
-    }
-  }
-}
-
-void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) {
-  vp9_regular_quantize_b_32x32(x, 256);
-  vp9_regular_quantize_b_32x32(x, 320);
-}
-
-void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) {
-  int i;
-
-  for (i = 256; i < 384; i += 16)
-    x->quantize_b_16x16(x, i, DCT_DCT);
-}
-
-void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) {
-  int i;
-
-  for (i = 256; i < 384; i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT);
-}
-
-void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) {
-  int i;
-
-  for (i = 256; i < 384; i++)
-    x->quantize_b_4x4(x, i);
-}
-
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2) {
-  vp9_regular_quantize_b_4x4(x, b_idx1);
-  vp9_regular_quantize_b_4x4(x, b_idx2);
-}
-
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
   int l;
@@ -618,6 +147,10 @@
 void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
+  int quant_uv_val;
+#if CONFIG_ALPHA
+  int quant_alpha_val;
+#endif
   int q;
 
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
@@ -631,39 +164,63 @@
       qrounding_factor = 64;
     }
     // dc values
-    quant_val = vp9_dc_quant(q, cpi->common.y1dc_delta_q);
-    invert_quant(cpi->Y1quant[q] + 0, cpi->Y1quant_shift[q] + 0, quant_val);
-    cpi->Y1zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-    cpi->Y1round[q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.Y1dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_y1[q][0] = (quant_val * zbin_boost[0]) >> 7;
+    quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q);
+    invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val);
+    cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.y_dequant[q][0] = quant_val;
+    cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
-    quant_val = vp9_dc_uv_quant(q, cpi->common.uvdc_delta_q);
-    invert_quant(cpi->UVquant[q] + 0, cpi->UVquant_shift[q] + 0, quant_val);
-    cpi->UVzbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-    cpi->UVround[q][0] = (qrounding_factor * quant_val) >> 7;
-    cpi->common.UVdequant[q][0] = quant_val;
+    quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
+    invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
+    cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.uv_dequant[q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
+#if CONFIG_ALPHA
+    quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
+    invert_quant(cpi->a_quant[q] + 0, cpi->a_quant_shift[q] + 0, quant_val);
+    cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+    cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
+    cpi->common.a_dequant[q][0] = quant_val;
+    cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
+#endif
+
+    quant_val = vp9_ac_quant(q, 0);
+    cpi->common.y_dequant[q][1] = quant_val;
+    quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+    cpi->common.uv_dequant[q][1] = quant_uv_val;
+#if CONFIG_ALPHA
+    quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q);
+    cpi->common.a_dequant[q][1] = quant_alpha_val;
+#endif
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d_4x4[i];
+      int rc = vp9_default_scan_4x4[i];
 
-      quant_val = vp9_ac_yquant(q);
-      invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
-      cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.Y1dequant[q][rc] = quant_val;
-      cpi->zrun_zbin_boost_y1[q][i] =
+      invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
+      cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
+      cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
+      cpi->zrun_zbin_boost_y[q][i] =
           ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      quant_val = vp9_ac_uv_quant(q, cpi->common.uvac_delta_q);
-      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);
-      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.UVdequant[q][rc] = quant_val;
+      invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
+        quant_uv_val);
+      cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+      cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
       cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
+          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
+
+#if CONFIG_ALPHA
+      invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
+          quant_alpha_val);
+      cpi->a_zbin[q][rc] =
+          ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
+      cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
+      cpi->zrun_zbin_boost_a[q][i] =
+          ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
+#endif
     }
   }
 }
@@ -670,91 +227,63 @@
 
 void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   int i;
-  int QIndex;
   MACROBLOCKD *xd = &x->e_mbd;
   int zbin_extra;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex);
 
-  // Select the baseline MB Q index allowing for any segment level change.
-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {
-    // Abs Value
-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)
-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-    // Delta Value
-    else {
-      QIndex = cpi->common.base_qindex +
-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);
-
-      // Clamp to valid range
-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
-    }
-  } else
-    QIndex = cpi->common.base_qindex;
-
   // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
+  zbin_extra = (cpi->common.y_dequant[qindex][1] *
+                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 0; i < 16; i++) {
-    x->block[i].quant = cpi->Y1quant[QIndex];
-    x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
-    x->block[i].zbin = cpi->Y1zbin[QIndex];
-    x->block[i].round = cpi->Y1round[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
+  x->plane[0].quant = cpi->y_quant[qindex];
+  x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
+  x->plane[0].zbin = cpi->y_zbin[qindex];
+  x->plane[0].round = cpi->y_round[qindex];
+  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
+  x->plane[0].zbin_extra = (int16_t)zbin_extra;
+  x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
 
-    // Segment skip feature.
-    x->block[i].skip_block =
-      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  }
-
   // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
+  zbin_extra = (cpi->common.uv_dequant[qindex][1] *
+                (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  for (i = 16; i < 24; i++) {
-    x->block[i].quant = cpi->UVquant[QIndex];
-    x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
-    x->block[i].zbin = cpi->UVzbin[QIndex];
-    x->block[i].round = cpi->UVround[QIndex];
-    x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
-    x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-
-    // Segment skip feature.
-    x->block[i].skip_block =
-      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+  for (i = 1; i < 3; i++) {
+    x->plane[i].quant = cpi->uv_quant[qindex];
+    x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
+    x->plane[i].zbin = cpi->uv_zbin[qindex];
+    x->plane[i].round = cpi->uv_round[qindex];
+    x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
+    x->plane[i].zbin_extra = (int16_t)zbin_extra;
+    x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
   }
 
+#if CONFIG_ALPHA
+  x->plane[3].quant = cpi->a_quant[qindex];
+  x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
+  x->plane[3].zbin = cpi->a_zbin[qindex];
+  x->plane[3].round = cpi->a_round[qindex];
+  x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
+  x->plane[3].zbin_extra = (int16_t)zbin_extra;
+  x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
+#endif
+
+  x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+
   /* save this macroblock QIndex for vp9_update_zbin_extra() */
-  x->e_mbd.q_index = QIndex;
+  x->e_mbd.q_index = qindex;
 }
 
 void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  int i;
-  int QIndex = x->e_mbd.q_index;
-  int zbin_extra;
+  const int qindex = x->e_mbd.q_index;
+  const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
+                (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
+  const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
+                  (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
 
-  // Y
-  zbin_extra = (cpi->common.Y1dequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-  for (i = 0; i < 16; i++) {
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-  }
-
-  // UV
-  zbin_extra = (cpi->common.UVdequant[QIndex][1] *
-                (cpi->zbin_mode_boost +
-                 x->act_zbin_adj)) >> 7;
-
-  for (i = 16; i < 24; i++) {
-    x->block[i].zbin_extra = (int16_t)zbin_extra;
-  }
+  x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
+  x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
+  x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
@@ -770,15 +299,11 @@
 
   cm->base_qindex = Q;
 
-  // Set lossless mode
-  if (cm->base_qindex <= 4)
-    cm->base_qindex = 0;
-
   // if any of the delta_q values are changing update flag will
   // have to be set.
-  cm->y1dc_delta_q = 0;
-  cm->uvdc_delta_q = 0;
-  cm->uvac_delta_q = 0;
+  cm->y_dc_delta_q = 0;
+  cm->uv_dc_delta_q = 0;
+  cm->uv_ac_delta_q = 0;
 
   // quantizer has to be reinitialized if any delta_q changes.
   // As there are not any here for now this is inactive code.
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -22,46 +22,15 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/vp9_quantize_x86.h"
-#endif
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
+                  TX_TYPE tx_type);
 
-void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_ix, TX_TYPE type);
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx);
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2);
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type);
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type);
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx);
-
-void vp9_quantize_mb_4x4(MACROBLOCK *x);
-void vp9_quantize_mb_8x8(MACROBLOCK *x);
-
-void vp9_quantize_mbuv_4x4(MACROBLOCK *x);
-void vp9_quantize_mby_4x4(MACROBLOCK *x);
-
-void vp9_quantize_mby_8x8(MACROBLOCK *x);
-void vp9_quantize_mbuv_8x8(MACROBLOCK *x);
-
-void vp9_quantize_mb_16x16(MACROBLOCK *x);
-void vp9_quantize_mby_16x16(MACROBLOCK *x);
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x);
-void vp9_quantize_sby_16x16(MACROBLOCK *x);
-void vp9_quantize_sby_8x8(MACROBLOCK *x);
-void vp9_quantize_sby_4x4(MACROBLOCK *x);
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x);
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x);
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x);
-
-void vp9_quantize_sb64y_32x32(MACROBLOCK *x);
-void vp9_quantize_sb64y_16x16(MACROBLOCK *x);
-void vp9_quantize_sb64y_8x8(MACROBLOCK *x);
-void vp9_quantize_sb64y_4x4(MACROBLOCK *x);
-void vp9_quantize_sb64uv_32x32(MACROBLOCK *x);
-void vp9_quantize_sb64uv_16x16(MACROBLOCK *x);
-void vp9_quantize_sb64uv_8x8(MACROBLOCK *x);
-void vp9_quantize_sb64uv_4x4(MACROBLOCK *x);
-
+void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
+                                     int y_blocks);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+                                int y_blocks);
+void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
+                                int y_blocks);
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -30,16 +30,6 @@
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
-#ifdef MODE_STATS
-extern unsigned int y_modes[VP9_YMODES];
-extern unsigned int uv_modes[VP9_UV_MODES];
-extern unsigned int b_modes[B_MODE_COUNT];
-
-extern unsigned int inter_y_modes[MB_MODE_COUNT];
-extern unsigned int inter_uv_modes[VP9_UV_MODES];
-extern unsigned int inter_b_modes[B_MODE_COUNT];
-#endif
-
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9
 
@@ -89,7 +79,7 @@
 // tables if and when things settle down in the experimental bitstream
 double vp9_convert_qindex_to_q(int qindex) {
   // Convert the index to a real Q value (scaled down to match old Q values)
-  return vp9_ac_yquant(qindex) / 4.0;
+  return vp9_ac_quant(qindex, 0) / 4.0;
 }
 
 int vp9_gfboost_qadjust(int qindex) {
@@ -112,7 +102,7 @@
   const double q = vp9_convert_qindex_to_q(qindex);
   int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
 
-  // q based adjustment to baseline enumberator
+  // q based adjustment to baseline enumerator
   enumerator += (int)(enumerator * q) >> 12;
   return (int)(0.5 + (enumerator * correction_factor / q));
 }
@@ -132,52 +122,31 @@
   vp9_copy(cc->nmvcosts,  cpi->mb.nmvcosts);
   vp9_copy(cc->nmvcosts_hp,  cpi->mb.nmvcosts_hp);
 
-  vp9_copy(cc->vp9_mode_contexts, cm->fc.vp9_mode_contexts);
+  vp9_copy(cc->inter_mode_probs, cm->fc.inter_mode_probs);
 
-  vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
-  vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
-  vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
+  vp9_copy(cc->y_mode_prob, cm->fc.y_mode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
-  vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
+  vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(cc->y_modes,       y_modes);
-  vp9_copy(cc->uv_modes,      uv_modes);
-  vp9_copy(cc->b_modes,       b_modes);
-  vp9_copy(cc->inter_y_modes,  inter_y_modes);
-  vp9_copy(cc->inter_uv_modes, inter_uv_modes);
-  vp9_copy(cc->inter_b_modes,  inter_b_modes);
-#endif
-
   vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs);
-  vp9_copy(cc->ref_pred_probs_update, cpi->ref_pred_probs_update);
-  vp9_copy(cc->ref_pred_probs, cm->ref_pred_probs);
-  vp9_copy(cc->prob_comppred, cm->prob_comppred);
 
+  vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob);
+  vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob);
+  vp9_copy(cc->single_ref_prob, cm->fc.single_ref_prob);
+  vp9_copy(cc->comp_ref_prob, cm->fc.comp_ref_prob);
+
   vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
-             cm->last_frame_seg_map, (cm->mb_rows * cm->mb_cols));
+             cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
   vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
 
-  vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4);
-  vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
-  vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
-  vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
+  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  cc->interintra_prob = cm->fc.interintra_prob;
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4);
-  vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8);
-  vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16);
-  vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32);
-  vp9_copy(cc->nzc_pcat_probs, cm->fc.nzc_pcat_probs);
-#endif
+  vp9_copy(cc->tx_probs_8x8p, cm->fc.tx_probs_8x8p);
+  vp9_copy(cc->tx_probs_16x16p, cm->fc.tx_probs_16x16p);
+  vp9_copy(cc->tx_probs_32x32p, cm->fc.tx_probs_32x32p);
+  vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs);
 }
 
 void vp9_restore_coding_context(VP9_COMP *cpi) {
@@ -193,53 +162,32 @@
   vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
   vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
 
-  vp9_copy(cm->fc.vp9_mode_contexts, cc->vp9_mode_contexts);
+  vp9_copy(cm->fc.inter_mode_probs, cc->inter_mode_probs);
 
-  vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
-  vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
-  vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
+  vp9_copy(cm->fc.y_mode_prob, cc->y_mode_prob);
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
-  vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
+  vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
-  // Stats
-#ifdef MODE_STATS
-  vp9_copy(y_modes, cc->y_modes);
-  vp9_copy(uv_modes, cc->uv_modes);
-  vp9_copy(b_modes, cc->b_modes);
-  vp9_copy(inter_y_modes, cc->inter_y_modes);
-  vp9_copy(inter_uv_modes, cc->inter_uv_modes);
-  vp9_copy(inter_b_modes, cc->inter_b_modes);
-#endif
-
   vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs);
-  vp9_copy(cpi->ref_pred_probs_update, cc->ref_pred_probs_update);
-  vp9_copy(cm->ref_pred_probs, cc->ref_pred_probs);
-  vp9_copy(cm->prob_comppred, cc->prob_comppred);
 
+  vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob);
+  vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob);
+  vp9_copy(cm->fc.single_ref_prob, cc->single_ref_prob);
+  vp9_copy(cm->fc.comp_ref_prob, cc->comp_ref_prob);
+
   vpx_memcpy(cm->last_frame_seg_map,
              cpi->coding_context.last_frame_seg_map_copy,
-             (cm->mb_rows * cm->mb_cols));
+             (cm->mi_rows * cm->mi_cols));
 
   vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
   vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
 
-  vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4);
-  vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
-  vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
-  vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
+  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
-#if CONFIG_COMP_INTERINTRA_PRED
-  cm->fc.interintra_prob = cc->interintra_prob;
-#endif
-#if CONFIG_CODE_NONZEROCOUNT
-  vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4);
-  vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8);
-  vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16);
-  vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32);
-  vp9_copy(cm->fc.nzc_pcat_probs, cc->nzc_pcat_probs);
-#endif
+  vp9_copy(cm->fc.tx_probs_8x8p, cc->tx_probs_8x8p);
+  vp9_copy(cm->fc.tx_probs_16x16p, cc->tx_probs_16x16p);
+  vp9_copy(cm->fc.tx_probs_32x32p, cc->tx_probs_32x32p);
+  vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs);
 }
 
 void vp9_setup_key_frame(VP9_COMP *cpi) {
@@ -258,12 +206,11 @@
 void vp9_setup_inter_frame(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  if (cm->error_resilient_mode)
+  if (cm->error_resilient_mode || cm->intra_only)
     vp9_setup_past_independence(cm, xd);
 
   assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
-  vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx],
-             sizeof(cm->fc));
+  cm->fc = cm->frame_contexts[cm->frame_context_idx];
 }
 
 static int estimate_bits_at_q(int frame_kind, int q, int mbs,
@@ -300,7 +247,7 @@
 }
 
 
-//  Do the best we can to define the parameteres for the next GF based
+//  Do the best we can to define the parameters for the next GF based
 //  on what information we have available.
 //
 //  In this experimental code only two pass is supported
@@ -358,16 +305,13 @@
           (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0)
            * cpi->last_boost) / 100;
       }
-
     } else {
       // If there is an active ARF at this location use the minimum
-      // bits on this frame even if it is a contructed arf.
+      // bits on this frame even if it is a constructed arf.
       // The active maximum quantizer insures that an appropriate
-      // number of bits will be spent if needed for contstructed ARFs.
+      // number of bits will be spent if needed for constructed ARFs.
       cpi->this_frame_target = 0;
     }
-
-    cpi->current_gf_interval = cpi->frames_till_gf_update_due;
   }
 }
 
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -13,8 +13,8 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/common/vp9_pragmas.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -34,7 +34,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -42,33 +41,17 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
 #define INVALID_MV 0x80008000
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
 
-static const int auto_speed_thresh[17] = {
-  1000,
-  200,
-  150,
-  130,
-  150,
-  125,
-  120,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  115,
-  105
-};
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
+#define I4X4_PRED 0x8000
+#define SPLITMV 0x10000
+
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
@@ -104,118 +87,63 @@
   {SPLITMV,   GOLDEN_FRAME, NONE},
   {SPLITMV,   ALTREF_FRAME, NONE},
 
-  {B_PRED,    INTRA_FRAME,  NONE},
-  {I8X8_PRED, INTRA_FRAME,  NONE},
+  {I4X4_PRED, INTRA_FRAME,  NONE},
 
   /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME},
+  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
+  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
+  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
 
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME},
-
   {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
   {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
   {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
 
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME  },
+  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
   {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
 
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME  },
+  {SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
   {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  /* compound inter-intra prediction */
-  {ZEROMV,    LAST_FRAME,   INTRA_FRAME},
-  {NEARESTMV, LAST_FRAME,   INTRA_FRAME},
-  {NEARMV,    LAST_FRAME,   INTRA_FRAME},
-  {NEWMV,     LAST_FRAME,   INTRA_FRAME},
-
-  {ZEROMV,    GOLDEN_FRAME,   INTRA_FRAME},
-  {NEARESTMV, GOLDEN_FRAME,   INTRA_FRAME},
-  {NEARMV,    GOLDEN_FRAME,   INTRA_FRAME},
-  {NEWMV,     GOLDEN_FRAME,   INTRA_FRAME},
-
-  {ZEROMV,    ALTREF_FRAME,   INTRA_FRAME},
-  {NEARESTMV, ALTREF_FRAME,   INTRA_FRAME},
-  {NEARMV,    ALTREF_FRAME,   INTRA_FRAME},
-  {NEWMV,     ALTREF_FRAME,   INTRA_FRAME},
-#endif
 };
 
-static void fill_token_costs(vp9_coeff_count *c,
-                             vp9_coeff_probs *p,
-                             int block_type_counts) {
-  int i, j, k, l;
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for blocks size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
+  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
 
-  for (i = 0; i < block_type_counts; i++)
-    for (j = 0; j < REF_TYPES; j++)
-      for (k = 0; k < COEF_BANDS; k++)
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
-                               p[i][j][k][l],
-                               vp9_coef_tree);
-        }
-}
+#define BASE_RD_THRESH_FREQ_FACT 16
+#define MAX_RD_THRESH_FREQ_FACT 32
+#define MAX_RD_THRESH_FREQ_INC 1
 
-#if CONFIG_CODE_NONZEROCOUNT
-static void fill_nzc_costs(VP9_COMP *cpi, int block_size) {
-  int nzc_context, r, b, nzc, values;
-  int cost[16];
-  values = block_size * block_size + 1;
-
-  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {
-    for (r = 0; r < REF_TYPES; ++r) {
-      for (b = 0; b < BLOCK_TYPES; ++b) {
-        unsigned int *nzc_costs;
-        if (block_size == 4) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],
-                          vp9_nzc4x4_tree);
-          nzc_costs = cpi->mb.nzc_costs_4x4[nzc_context][r][b];
-        } else if (block_size == 8) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],
-                          vp9_nzc8x8_tree);
-          nzc_costs = cpi->mb.nzc_costs_8x8[nzc_context][r][b];
-        } else if (block_size == 16) {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],
-                          vp9_nzc16x16_tree);
-          nzc_costs = cpi->mb.nzc_costs_16x16[nzc_context][r][b];
-        } else {
-          vp9_cost_tokens(cost,
-                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],
-                          vp9_nzc32x32_tree);
-          nzc_costs = cpi->mb.nzc_costs_32x32[nzc_context][r][b];
-        }
-
-        for (nzc = 0; nzc < values; ++nzc) {
-          int e, c, totalcost = 0;
-          c = codenzc(nzc);
-          totalcost = cost[c];
-          if ((e = vp9_extranzcbits[c])) {
-            int x = nzc - vp9_basenzcvalue[c];
-            while (e--) {
-              totalcost += vp9_cost_bit(
-                  cpi->common.fc.nzc_pcat_probs[nzc_context]
-                                               [c - NZC_TOKENS_NOEXTRA][e],
-                  ((x >> e) & 1));
-            }
+static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
+                             vp9_coeff_count (*cnoskip)[BLOCK_TYPES],
+                             vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
+  int i, j, k, l;
+  TX_SIZE t;
+  for (t = TX_4X4; t <= TX_32X32; t++)
+    for (i = 0; i < BLOCK_TYPES; i++)
+      for (j = 0; j < REF_TYPES; j++)
+        for (k = 0; k < COEF_BANDS; k++)
+          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+            vp9_prob probs[ENTROPY_NODES];
+            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
+            vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs,
+                            vp9_coef_tree);
+#if CONFIG_BALANCED_COEFTREE
+            // Replace the eob node prob with a very small value so that the
+            // cost approximately equals the cost without the eob node
+            probs[1] = 1;
+            vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree);
+#else
+            vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs,
+                                 vp9_coef_tree);
+            assert(c[t][i][j][k][l][DCT_EOB_TOKEN] ==
+                   cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]);
+#endif
           }
-          nzc_costs[nzc] = totalcost;
-        }
-      }
-    }
-  }
 }
-#endif
 
-
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0,
@@ -236,12 +164,12 @@
   for (i = 0; i < QINDEX_RANGE; i++) {
     sad_per_bit16lut[i] =
       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
   }
 }
 
 static int compute_rd_mult(int qindex) {
-  int q = vp9_dc_quant(qindex, 0);
+  const int q = vp9_dc_quant(qindex, 0);
   return (11 * q * q) >> 2;
 }
 
@@ -252,7 +180,7 @@
 
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
-  int q, i;
+  int q, i, bsize;
 
   vp9_clear_system_state();  // __asm emms;
 
@@ -260,7 +188,7 @@
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  qindex = clamp(qindex, 0, MAXQ);
 
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
@@ -284,44 +212,56 @@
     cpi->RDDIV = 1;
     cpi->RDMULT /= 100;
 
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < INT_MAX) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
+    for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
+      for (i = 0; i < MAX_MODES; ++i) {
+        // Threshold here seem unecessarily harsh but fine given actual
+        // range of values used for cpi->sf.thresh_mult[]
+        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+
+        // *4 relates to the scaling of rd_thresh_block_size_factor[]
+        if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
+          cpi->rd_threshes[bsize][i] =
+            cpi->sf.thresh_mult[i] * q *
+            rd_thresh_block_size_factor[bsize] / (4 * 100);
+        } else {
+          cpi->rd_threshes[bsize][i] = INT_MAX;
+        }
+        cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
+        cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
     }
   } else {
     cpi->RDDIV = 100;
 
-    for (i = 0; i < MAX_MODES; i++) {
-      if (cpi->sf.thresh_mult[i] < (INT_MAX / q)) {
-        cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
-      } else {
-        cpi->rd_threshes[i] = INT_MAX;
+    for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) {
+      for (i = 0; i < MAX_MODES; i++) {
+        // Threshold here seem unecessarily harsh but fine given actual
+        // range of values used for cpi->sf.thresh_mult[]
+        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
+
+        if (cpi->sf.thresh_mult[i] < thresh_max) {
+          cpi->rd_threshes[bsize][i] =
+            cpi->sf.thresh_mult[i] * q *
+            rd_thresh_block_size_factor[bsize] / 4;
+        } else {
+          cpi->rd_threshes[bsize][i] = INT_MAX;
+        }
+        cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
+        cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
       }
-      cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
     }
   }
 
-  fill_token_costs(cpi->mb.token_costs[TX_4X4],
-                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES);
-  fill_token_costs(cpi->mb.token_costs[TX_8X8],
-                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES);
-  fill_token_costs(cpi->mb.token_costs[TX_16X16],
-                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);
-  fill_token_costs(cpi->mb.token_costs[TX_32X32],
-                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);
-#if CONFIG_CODE_NONZEROCOUNT
-  fill_nzc_costs(cpi, 4);
-  fill_nzc_costs(cpi, 8);
-  fill_nzc_costs(cpi, 16);
-  fill_nzc_costs(cpi, 32);
-#endif
+  fill_token_costs(cpi->mb.token_costs,
+                   cpi->mb.token_costs_noskip,
+                   cpi->common.fc.coef_probs);
 
+  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
+    vp9_cost_tokens(cpi->mb.partition_cost[i],
+                    cpi->common.fc.partition_prob[cpi->common.frame_type][i],
+                    vp9_partition_tree);
+
   /*rough estimate for costing*/
-  cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp9_init_mode_costs(cpi);
 
   if (cpi->common.frame_type != KEY_FRAME) {
@@ -345,389 +285,136 @@
   return error;
 }
 
-int vp9_mbblock_error_c(MACROBLOCK *mb) {
-  BLOCK  *be;
-  BLOCKD *bd;
-  int i, j;
-  int berror, error = 0;
-
-  for (i = 0; i < 16; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-    berror = 0;
-    for (j = 0; j < 16; j++) {
-      int this_diff = be->coeff[j] - bd->dqcoeff[j];
-      berror += this_diff * this_diff;
-    }
-    error += berror;
-  }
-  return error;
-}
-
-int vp9_mbuverror_c(MACROBLOCK *mb) {
-  BLOCK  *be;
-  BLOCKD *bd;
-
-  int i, error = 0;
-
-  for (i = 16; i < 24; i++) {
-    be = &mb->block[i];
-    bd = &mb->e_mbd.block[i];
-
-    error += vp9_block_error_c(be->coeff, bd->dqcoeff, 16);
-  }
-
-  return error;
-}
-
-int vp9_uvsse(MACROBLOCK *x) {
-  uint8_t *uptr, *vptr;
-  uint8_t *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
-  uint8_t *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
-  int uv_stride = x->block[16].src_stride;
-
-  unsigned int sse1 = 0;
-  unsigned int sse2 = 0;
-  int mv_row = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.row;
-  int mv_col = x->e_mbd.mode_info_context->mbmi.mv[0].as_mv.col;
-  int offset;
-  int pre_stride = x->e_mbd.block[16].pre_stride;
-
-  if (mv_row < 0)
-    mv_row -= 1;
-  else
-    mv_row += 1;
-
-  if (mv_col < 0)
-    mv_col -= 1;
-  else
-    mv_col += 1;
-
-  mv_row /= 2;
-  mv_col /= 2;
-
-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-  uptr = x->e_mbd.pre.u_buffer + offset;
-  vptr = x->e_mbd.pre.v_buffer + offset;
-
-  if ((mv_row | mv_col) & 7) {
-    vp9_sub_pixel_variance8x8(uptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, upred_ptr, uv_stride, &sse2);
-    vp9_sub_pixel_variance8x8(vptr, pre_stride, (mv_col & 7) << 1,
-                              (mv_row & 7) << 1, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  } else {
-    vp9_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
-    vp9_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
-    sse2 += sse1;
-  }
-  return sse2;
-}
-
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
-                              int ib, PLANE_TYPE type,
-                              ENTROPY_CONTEXT *a,
-                              ENTROPY_CONTEXT *l,
-                              TX_SIZE tx_size) {
+                              int plane, int block, PLANE_TYPE type,
+                              ENTROPY_CONTEXT *A,
+                              ENTROPY_CONTEXT *L,
+                              TX_SIZE tx_size,
+                              int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt;
-  const int eob = xd->eobs[ib];
   int c = 0;
   int cost = 0, pad;
   const int *scan, *nb;
-  const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
+                                           block, 16);
+  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
-  ENTROPY_CONTEXT a_ec, l_ec;
-  ENTROPY_CONTEXT *const a1 = a +
-      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
-  ENTROPY_CONTEXT *const l1 = l +
-      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);
+  ENTROPY_CONTEXT above_ec, left_ec;
+  TX_TYPE tx_type = DCT_DCT;
 
-#if CONFIG_CODE_NONZEROCOUNT
-  int nzc_context = vp9_get_nzc_context(cm, xd, ib);
-  unsigned int *nzc_cost;
-#else
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  vp9_prob (*coef_probs)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
-                        [ENTROPY_NODES];
-#endif
+  unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+      mb->token_costs_noskip[tx_size][type][ref];
+
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
+  const uint8_t * band_translate;
 
   // Check for consistency of tx_size with mode info
+  assert((!type && !plane) || (type && plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
   } else {
-    TX_SIZE tx_size_uv = get_uv_tx_size(xd);
+    TX_SIZE tx_size_uv = get_uv_tx_size(mbmi);
     assert(tx_size == tx_size_uv);
   }
 
   switch (tx_size) {
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, ib) : DCT_DCT;
-      a_ec = *a;
-      l_ec = *l;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_4x4;
-#endif
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block) : DCT_DCT;
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
       seg_eob = 16;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_4x4;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_4x4;
-      } else {
-        scan = vp9_default_zig_zag1d_4x4;
-      }
+      scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_8x8;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_8x8;
-      } else {
-        scan = vp9_default_zig_zag1d_8x8;
-      }
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_8x8;
-#endif
+      const int sz = 1 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      above_ec = (A[0] + A[1]) != 0;
+      left_ec = (L[0] + L[1]) != 0;
+      scan = get_scan_8x8(tx_type);
       seg_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
-      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      if (tx_type == ADST_DCT) {
-        scan = vp9_row_scan_16x16;
-      } else if (tx_type == DCT_ADST) {
-        scan = vp9_col_scan_16x16;
-      } else {
-        scan = vp9_default_zig_zag1d_16x16;
-      }
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_16x16;
-#endif
+      const int sz = 2 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
       seg_eob = 256;
-      if (type == PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      }
+      above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      scan = vp9_default_zig_zag1d_32x32;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];
-#else
-      coef_probs = cm->fc.coef_probs_32x32;
-#endif
+      scan = vp9_default_scan_32x32;
       seg_eob = 1024;
-      if (type == PLANE_TYPE_UV) {
-        ENTROPY_CONTEXT *a2, *a3, *l2, *l3;
-        a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      }
+      above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     default:
       abort();
       break;
   }
+  assert(eob <= seg_eob);
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+  pt = combine_entropy_contexts(above_ec, left_ec);
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
   default_eob = seg_eob;
 
-#if CONFIG_CODE_NONZEROCOUNT == 0
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
-#endif
 
+  /* sanity check to ensure that we do not have spurious non-zero q values */
+  if (eob < seg_eob)
+    assert(qcoeff_ptr[scan[eob]] == 0);
+
   {
-#if CONFIG_CODE_NONZEROCOUNT
-    int nzc = 0;
-#endif
-    for (; c < eob; c++) {
+    for (c = 0; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
-      int t = vp9_dct_value_tokens_ptr[v].Token;
-#if CONFIG_CODE_NONZEROCOUNT
-      nzc += (v != 0);
-#endif
-      token_cache[c] = t;
-      cost += token_costs[get_coef_band(scan, tx_size, c)][pt][t];
-      cost += vp9_dct_value_cost_ptr[v];
-#if !CONFIG_CODE_NONZEROCOUNT
-      if (!c || token_cache[c - 1])
-        cost += vp9_cost_bit(coef_probs[type][ref]
-                                       [get_coef_band(scan, tx_size, c)]
-                                       [pt][0], 1);
-#endif
-      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);
+      int t = vp9_dct_value_tokens_ptr[v].token;
+      int band = get_coef_band(band_translate, c);
+      if (c)
+        pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+
+      if (!c || token_cache[scan[c - 1]])  // do not skip eob
+        cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      else
+        cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
+      token_cache[scan[c]] = vp9_pt_energy_class[t];
     }
-#if CONFIG_CODE_NONZEROCOUNT
-    cost += nzc_cost[nzc];
-#else
-    if (c < seg_eob)
-      cost += mb->token_costs[tx_size][type][ref]
-                             [get_coef_band(scan, tx_size, c)]
-                             [pt][DCT_EOB_TOKEN];
-#endif
+    if (c < seg_eob) {
+      if (c)
+        pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
+      cost += mb->token_costs_noskip[tx_size][type][ref]
+          [get_coef_band(band_translate, c)]
+          [pt][DCT_EOB_TOKEN];
+    }
   }
 
   // is eob first coefficient;
-  pt = (c > 0);
-  *a = *l = pt;
-  if (tx_size >= TX_8X8) {
-    a[1] = l[1] = pt;
-    if (tx_size >= TX_16X16) {
-      if (type == PLANE_TYPE_UV) {
-        a1[0] = a1[1] = l1[0] = l1[1] = pt;
-      } else {
-        a[2] = a[3] = l[2] = l[3] = pt;
-        if (tx_size >= TX_32X32) {
-          a1[0] = a1[1] = a1[2] = a1[3] = pt;
-          l1[0] = l1[1] = l1[2] = l1[3] = pt;
-        }
-      }
-    }
+  for (pt = 0; pt < (1 << tx_size); pt++) {
+    A[pt] = L[pt] = c > 0;
   }
-  return cost;
-}
 
-static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
-
-  for (b = 0; b < 16; b++)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above[TX_4X4][b],
-                        tl + vp9_block2left[TX_4X4][b],
-                        TX_4X4);
-
   return cost;
 }
 
-static void macro_block_yrd_4x4(VP9_COMMON *const cm,
-                                MACROBLOCK *mb,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_mby_4x4(mb);
-  vp9_quantize_mby_4x4(mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_4x4(cm, mb);
-  *skippable = vp9_mby_is_skippable_4x4(xd);
-}
-
-static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  int cost = 0;
-  int b;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context, sizeof(t_left));
-
-  for (b = 0; b < 16; b += 4)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b],
-                        TX_8X8);
-
-  return cost;
-}
-
-static void macro_block_yrd_8x8(VP9_COMMON *const cm,
-                                MACROBLOCK *mb,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_mby_8x8(mb);
-  vp9_quantize_mby_8x8(mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_8x8(cm, mb);
-  *skippable = vp9_mby_is_skippable_8x8(xd);
-}
-
-static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));
-
-  return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);
-}
-
-static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_mby_16x16(mb);
-  vp9_quantize_mby_16x16(mb);
-  // TODO(jingning) is it possible to quickly determine whether to force
-  //                trailing coefficients to be zero, instead of running trellis
-  //                optimization in the rate-distortion optimization loop?
-  if (mb->optimize &&
-      xd->mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(cm, mb);
-
-  *distortion = vp9_mbblock_error(mb) >> 2;
-  *rate = rdcost_mby_16x16(cm, mb);
-  *skippable = vp9_mby_is_skippable_16x16(xd);
-}
-
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                      int (*r)[2], int *rate,
                                      int *d, int *distortion,
@@ -737,41 +424,34 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  vp9_prob skip_prob = cm->mb_no_coeff_skip ?
-                       vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
+  vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
   int64_t rd[TX_SIZE_MAX_SB][2];
   int n, m;
+  int s0, s1;
 
+  const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+
   for (n = TX_4X4; n <= max_txfm_size; n++) {
     r[n][1] = r[n][0];
     for (m = 0; m <= n - (n == max_txfm_size); m++) {
       if (m == n)
-        r[n][1] += vp9_cost_zero(cm->prob_tx[m]);
+        r[n][1] += vp9_cost_zero(tx_probs[m]);
       else
-        r[n][1] += vp9_cost_one(cm->prob_tx[m]);
+        r[n][1] += vp9_cost_one(tx_probs[m]);
     }
   }
 
-  if (cm->mb_no_coeff_skip) {
-    int s0, s1;
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
 
-    assert(skip_prob > 0);
-    s0 = vp9_cost_bit(skip_prob, 0);
-    s1 = vp9_cost_bit(skip_prob, 1);
-
-    for (n = TX_4X4; n <= max_txfm_size; n++) {
-      if (s[n]) {
-        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
-      } else {
-        rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
-        rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
-      }
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    if (s[n]) {
+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+    } else {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
-  } else {
-    for (n = TX_4X4; n <= max_txfm_size; n++) {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]);
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]);
-    }
   }
 
   if (max_txfm_size == TX_32X32 &&
@@ -780,17 +460,19 @@
         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_32X32;
-  } else if ( cm->txfm_mode == ALLOW_16X16 ||
-             (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
-             (cm->txfm_mode == TX_MODE_SELECT &&
-              rd[TX_16X16][1] < rd[TX_8X8][1] &&
-              rd[TX_16X16][1] < rd[TX_4X4][1])) {
+  } else if (max_txfm_size >= TX_16X16 &&
+             (cm->txfm_mode == ALLOW_16X16 ||
+              cm->txfm_mode == ALLOW_32X32 ||
+              (cm->txfm_mode == TX_MODE_SELECT &&
+               rd[TX_16X16][1] < rd[TX_8X8][1] &&
+               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
+             cm->txfm_mode == ALLOW_16X16 ||
+             cm->txfm_mode == ALLOW_32X32 ||
            (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
@@ -800,13 +482,14 @@
 
   txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
   txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
-  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
-  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0];
+  txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0];
   if (max_txfm_size == TX_32X32 &&
       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
       rd[TX_32X32][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+  else if (max_txfm_size >= TX_16X16 &&
+           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
     txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
     txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
@@ -813,41 +496,14 @@
                                  rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
-static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                            int *distortion, int *skippable,
-                            int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
-
-  vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
-                   x->block[0].src_stride);
-
-  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
-                           txfm_cache, TX_16X16);
-}
-
-static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[4] = p[4];
-  d[8] = p[8];
-  d[12] = p[12];
-}
-
-static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                                int block_size, int shift) {
+static int block_error(int16_t *coeff, int16_t *dqcoeff,
+                       int block_size, int shift) {
   int i;
   int64_t error = 0;
 
   for (i = 0; i < block_size; i++) {
-    unsigned int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
+    int this_diff = coeff[i] - dqcoeff[i];
+    error += (unsigned)this_diff * this_diff;
   }
   error >>= shift;
 
@@ -854,383 +510,226 @@
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 64; b++)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_4X4][b],
-                        tl + vp9_block2left_sb[TX_4X4][b], TX_4X4);
-
-  return cost;
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                     16 << (bwl + bhl), shift);
 }
 
-static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+  int64_t sum = 0;
+  int plane;
 
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x);
-  vp9_quantize_sby_4x4(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_4x4(cm, x);
-  *skippable  = vp9_sby_is_skippable_4x4(xd);
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    const int subsampling = x->e_mbd.plane[plane].subsampling_x +
+                            x->e_mbd.plane[plane].subsampling_y;
+    sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+                       16 << (bwl + bhl - subsampling), 0);
+  }
+  sum >>= shift;
+  return sum > INT_MAX ? INT_MAX : (int)sum;
 }
 
-static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+struct rdcost_block_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  TX_SIZE tx_size;
+  int bw;
+  int bh;
+  int cost;
+};
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
+static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct rdcost_block_args* args = arg;
+  int x_idx, y_idx;
+  MACROBLOCKD * const xd = &args->x->e_mbd;
 
-  for (b = 0; b < 64; b += 4)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_8X8][b],
-                        tl + vp9_block2left_sb[TX_8X8][b], TX_8X8);
+  txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx,
+                           &y_idx);
 
-  return cost;
+  args->cost += cost_coeffs(args->cm, args->x, plane, block,
+                            xd->plane[plane].plane_type, args->t_above + x_idx,
+                            args->t_left + y_idx, args->tx_size,
+                            args->bw * args->bh);
 }
 
-static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane,
+                        BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;
+  struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 };
 
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x);
-  vp9_quantize_sby_8x8(x);
+  vpx_memcpy(&args.t_above, xd->plane[plane].above_context,
+             sizeof(ENTROPY_CONTEXT) * bw);
+  vpx_memcpy(&args.t_left, xd->plane[plane].left_context,
+             sizeof(ENTROPY_CONTEXT) * bh);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_8x8(cm, x);
-  *skippable  = vp9_sby_is_skippable_8x8(xd);
+  foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args);
+
+  return args.cost;
 }
 
-static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int cost = 0, plane;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 64; b += 16)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb[TX_16X16][b],
-                        tl + vp9_block2left_sb[TX_16X16][b], TX_16X16);
-
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
+  }
   return cost;
 }
 
-static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int *distortion, int *skippable,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  xd->mode_info_context->mbmi.txfm_size = tx_size;
 
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x);
-  vp9_quantize_sby_16x16(x);
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    vp9_encode_intra_block_y(cm, x, bsize);
+  else
+    vp9_xform_quant_sby(cm, x, bsize);
 
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);
-  *rate       = rdcost_sby_16x16(cm, x);
-  *skippable  = vp9_sby_is_skippable_16x16(xd);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);
+  *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
 
-static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
-}
-
-static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x);
-  vp9_quantize_sby_32x32(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0);
-  *rate       = rdcost_sby_32x32(cm, x);
-  *skippable  = vp9_sby_is_skippable_32x32(xd);
-}
-
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
-                            int *skip,
+                            int *skip, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
-  super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
+  assert(bs == mbmi->sb_type);
+  if (mbmi->ref_frame[0] > INTRA_FRAME)
+    vp9_subtract_sby(x, bs);
 
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_SIZE_MAX_SB - 1);
-}
+  if (cpi->speed > 4) {
+    if (bs >= BLOCK_SIZE_SB32X32) {
+      mbmi->txfm_size = TX_32X32;
+    } else if (bs >= BLOCK_SIZE_MB16X16) {
+      mbmi->txfm_size = TX_16X16;
+    } else if (bs >= BLOCK_SIZE_SB8X8) {
+      mbmi->txfm_size = TX_8X8;
+    } else {
+      mbmi->txfm_size = TX_4X4;
+    }
+    vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+                             mbmi->txfm_size);
+    return;
+  }
+  if (bs >= BLOCK_SIZE_SB32X32)
+    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+                             bs, TX_32X32);
+  if (bs >= BLOCK_SIZE_MB16X16)
+    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+                             bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
+                           TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
+                           TX_4X4);
 
-static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b++)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_4X4][b],
-                        tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4);
-
-  return cost;
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                           skip, txfm_cache,
+                           TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
+                           - (bs < BLOCK_SIZE_MB16X16));
 }
 
-static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sb64y_4x4(x);
-  vp9_quantize_sb64y_4x4(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_4x4(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_4x4(xd);
-}
-
-static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 4)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_8X8][b],
-                        tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8);
-
-  return cost;
-}
-
-static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sb64y_8x8(x);
-  vp9_quantize_sb64y_8x8(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_8x8(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_8x8(xd);
-}
-
-static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 16)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_16X16][b],
-                        tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16);
-
-  return cost;
-}
-
-static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                    int *rate, int *distortion,
-                                    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sb64y_16x16(x);
-  vp9_quantize_sb64y_16x16(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);
-  *rate       = rdcost_sb64y_16x16(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_16x16(xd);
-}
-
-static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {
-  int cost = 0, b;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;
-  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));
-  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));
-
-  for (b = 0; b < 256; b += 64)
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
-                        ta + vp9_block2above_sb64[TX_32X32][b],
-                        tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32);
-
-  return cost;
-}
-
-static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                    int *rate, int *distortion,
-                                    int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sb64y_32x32(x);
-  vp9_quantize_sb64y_32x32(x);
-
-  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0);
-  *rate       = rdcost_sb64y_32x32(cm, x);
-  *skippable  = vp9_sb64y_is_skippable_32x32(xd);
-}
-
-static void super_block_64_yrd(VP9_COMP *cpi,
-                               MACROBLOCK *x, int *rate, int *distortion,
-                               int *skip,
-                               int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-
-  vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);
-  super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-  super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-  super_block64_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);
-  super_block64_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);
-
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
-                           TX_SIZE_MAX_SB - 1);
-}
-
-static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
-  const unsigned int *p = (const unsigned int *)predictor;
-  unsigned int *d = (unsigned int *)dst;
-  d[0] = p[0];
-  d[1] = p[1];
-  d[4] = p[4];
-  d[5] = p[5];
-  d[8] = p[8];
-  d[9] = p[9];
-  d[12] = p[12];
-  d[13] = p[13];
-  d[16] = p[16];
-  d[17] = p[17];
-  d[20] = p[20];
-  d[21] = p[21];
-  d[24] = p[24];
-  d[25] = p[25];
-  d[28] = p[28];
-  d[29] = p[29];
-}
-
-static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
-                                     BLOCKD *b, B_PREDICTION_MODE *best_mode,
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
+                                     MB_PREDICTION_MODE *best_mode,
                                      int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  B_PREDICTION_MODE mode;
+                                     int *bestdistortion,
+                                     BLOCK_SIZE_TYPE bsize) {
+  MB_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
   int rate = 0;
   int distortion;
   VP9_COMMON *const cm = &cpi->common;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t *src, *dst;
+  int16_t *src_diff, *coeff;
 
-  ENTROPY_CONTEXT ta = *a, tempa = *a;
-  ENTROPY_CONTEXT tl = *l, templ = *l;
+  ENTROPY_CONTEXT ta[2], tempa[2];
+  ENTROPY_CONTEXT tl[2], templ[2];
   TX_TYPE tx_type = DCT_DCT;
   TX_TYPE best_tx_type = DCT_DCT;
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16.  Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 4x4 block
-   * */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 4);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
+  int idx, idy, block;
+  DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]);
 
-#if CONFIG_NEWBINTRAMODES
-  b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);
-#endif
+  assert(ib < 4);
+
+  vpx_memcpy(ta, a, sizeof(ta));
+  vpx_memcpy(tl, l, sizeof(tl));
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
+
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
-    int ratey;
+    int ratey = 0;
 
-#if CONFIG_NEWBINTRAMODES
-    if (xd->frame_type == KEY_FRAME) {
-      if (mode == B_CONTEXT_PRED) continue;
-    } else {
-      if (mode >= B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS &&
-          mode < B_CONTEXT_PRED)
-        continue;
-    }
-#endif
-
-    b->bmi.as_mode.first = mode;
-#if CONFIG_NEWBINTRAMODES
-    rate = bmode_costs[
-        mode == B_CONTEXT_PRED ? mode - CONTEXT_PRED_REPLACEMENTS : mode];
-#else
     rate = bmode_costs[mode];
-#endif
+    distortion = 0;
 
-    vp9_intra4x4_predict(xd, b, mode, b->predictor);
-    vp9_subtract_b(be, b, 16);
+    vpx_memcpy(tempa, ta, sizeof(ta));
+    vpx_memcpy(templ, tl, sizeof(tl));
 
-    b->bmi.as_mode.first = mode;
-    tx_type = get_tx_type_4x4(xd, be - x->block);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-      vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);
-    } else {
-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(x, be - x->block);
-    }
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        block = ib + idy * 2 + idx;
+        xd->mode_info_context->bmi[block].as_mode.first = mode;
+        src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                        x->plane[0].src.buf, src_stride);
+        src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                             x->plane[0].src_diff);
+        coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
+        dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                        xd->plane[0].dst.buf,
+                                        xd->plane[0].dst.stride);
+        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode,
+                             dst, xd->plane[0].dst.stride);
+        vp9_subtract_block(4, 4, src_diff, 8,
+                           src, src_stride,
+                           dst, xd->plane[0].dst.stride);
 
-    tempa = ta;
-    templ = tl;
+        tx_type = get_tx_type_4x4(xd, block);
+        if (tx_type != DCT_DCT) {
+          vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
+          x->quantize_b_4x4(x, block, tx_type, 16);
+        } else {
+          x->fwd_txm4x4(src_diff, coeff, 16);
+          x->quantize_b_4x4(x, block, tx_type, 16);
+        }
 
-    ratey = cost_coeffs(cm, x, b - xd->block,
-                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
-    rate += ratey;
-    distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+        ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
+                             tempa + idx, templ + idy, TX_4X4, 16);
+        distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                         block, 16), 16) >> 2;
 
+        if (best_tx_type != DCT_DCT)
+          vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                               dst, xd->plane[0].dst.stride, best_tx_type);
+        else
+          xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                             dst, xd->plane[0].dst.stride);
+      }
+    }
+
+    rate += ratey;
     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
     if (this_rd < best_rd) {
@@ -1240,21 +739,38 @@
       best_rd = this_rd;
       *best_mode = mode;
       best_tx_type = tx_type;
-      *a = tempa;
-      *l = templ;
-      copy_predictor(best_predictor, b->predictor);
-      vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+      vpx_memcpy(a, tempa, sizeof(tempa));
+      vpx_memcpy(l, templ, sizeof(templ));
+      for (idy = 0; idy < bh; ++idy) {
+        for (idx = 0; idx < bw; ++idx) {
+          block = ib + idy * 2 + idx;
+          vpx_memcpy(best_dqcoeff[idy * 2 + idx],
+                     BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                     sizeof(best_dqcoeff[0]));
+        }
+      }
     }
   }
-  b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
 
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);
-  else
-    xd->inv_txm4x4(best_dqcoeff, b->diff, 32);
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      block = ib + idy * 2 + idx;
+      xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
+      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                      xd->plane[0].dst.buf,
+                                      xd->plane[0].dst.stride);
 
-  vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+      vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
+                           dst, xd->plane[0].dst.stride);
+      // inverse transform
+      if (best_tx_type != DCT_DCT)
+        vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
+                             xd->plane[0].dst.stride, best_tx_type);
+      else
+        xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
+                           xd->plane[0].dst.stride);
+    }
+  }
 
   return best_rd;
 }
@@ -1262,60 +778,57 @@
 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
                                          int *Rate, int *rate_y,
                                          int *Distortion, int64_t best_rd) {
-  int i;
+  int i, j;
   MACROBLOCKD *const xd = &mb->e_mbd;
-  int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
+  int idx, idy;
+  int cost = 0;
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT t_above[4], t_left[4];
   int *bmode_costs;
+  MODE_INFO *const mic = xd->mode_info_context;
 
-  vpx_memcpy(&t_above, xd->above_context,
-             sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context,
-             sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+  bmode_costs = mb->mbmode_cost;
 
-  xd->mode_info_context->mbmi.mode = B_PRED;
-  bmode_costs = mb->inter_bmode_costs;
+  for (idy = 0; idy < 2; idy += bh) {
+    for (idx = 0; idx < 2; idx += bw) {
+      const int mis = xd->mode_info_stride;
+      MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+      int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
+      int UNINITIALIZED_IS_SAFE(d);
+      i = idy * 2 + idx;
 
-  for (i = 0; i < 16; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    const int mis = xd->mode_info_stride;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+      if (xd->frame_type == KEY_FRAME) {
+        const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                     left_block_mode(mic, i) : DC_PRED;
 
-    if (xd->frame_type == KEY_FRAME) {
-      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(mic, i);
+        bmode_costs  = mb->y_mode_costs[A][L];
+      }
 
-      bmode_costs  = mb->bmode_costs[A][L];
-    }
-#if CONFIG_NEWBINTRAMODES
-    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);
-#endif
+      total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
+                                        t_above + idx, t_left + idy,
+                                        &r, &ry, &d, bsize);
+      cost += r;
+      distortion += d;
+      tot_rate_y += ry;
 
-    total_rd += rd_pick_intra4x4block(
-                  cpi, mb, mb->block + i, xd->block + i, &best_mode,
-                  bmode_costs, ta + vp9_block2above[TX_4X4][i],
-                  tl + vp9_block2left[TX_4X4][i], &r, &ry, &d);
+      mic->bmi[i].as_mode.first = best_mode;
+      for (j = 1; j < bh; ++j)
+        mic->bmi[i + j * 2].as_mode.first = best_mode;
+      for (j = 1; j < bw; ++j)
+        mic->bmi[i + j].as_mode.first = best_mode;
 
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-
-    mic->bmi[i].as_mode.first = best_mode;
-
-#if 0  // CONFIG_NEWBINTRAMODES
-    printf("%d %d\n", mic->bmi[i].as_mode.first, mic->bmi[i].as_mode.context);
-#endif
-
-    if (total_rd >= best_rd)
-      break;
+      if (total_rd >= best_rd)
+        break;
+    }
   }
 
   if (total_rd >= best_rd)
@@ -1324,140 +837,68 @@
   *Rate = cost;
   *rate_y = tot_rate_y;
   *Distortion = distortion;
+  xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first;
 
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
-                                      MACROBLOCK *x,
-                                      int *rate,
-                                      int *rate_tokenonly,
-                                      int *distortion,
-                                      int *skippable,
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                      int *rate, int *rate_tokenonly,
+                                      int *distortion, int *skippable,
+                                      BLOCK_SIZE_TYPE bsize,
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MACROBLOCKD *const xd = &x->e_mbd;
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
+  TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
+  int i;
+  int *bmode_costs = x->mbmode_cost;
 
-  /* Y Search for 32x32 intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sby_s(&x->e_mbd);
-
-    super_block_yrd(cpi, x, &this_rate_tokenonly,
-                    &this_distortion, &s, txfm_cache);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
+  if (bsize < BLOCK_SIZE_SB8X8) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    return best_rd;
   }
 
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+  for (i = 0; i < NB_TXFM_MODES; i++)
+    txfm_cache[i] = INT64_MAX;
 
-  return best_rd;
-}
-
-static int64_t rd_pick_intra_sb64y_mode(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable,
-                                        int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int this_rate, this_rate_tokenonly;
-  int this_distortion, s;
-  int64_t best_rd = INT64_MAX, this_rd;
-
   /* Y Search for 32x32 intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    int64_t local_txfm_cache[NB_TXFM_MODES];
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+
+    if (cpi->common.frame_type == KEY_FRAME) {
+      const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
+      const MB_PREDICTION_MODE L = xd->left_available ?
+                                   left_block_mode(mic, 0) : DC_PRED;
+
+      bmode_costs = x->y_mode_costs[A][L];
+    }
     x->e_mbd.mode_info_context->mbmi.mode = mode;
-    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
 
-    super_block_64_yrd(cpi, x, &this_rate_tokenonly,
-                       &this_distortion, &s, txfm_cache);
-    this_rate = this_rate_tokenonly +
-                x->mbmode_cost[x->e_mbd.frame_type]
-                              [x->e_mbd.mode_info_context->mbmi.mode];
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+                    bsize, local_txfm_cache);
+
+    this_rate = this_rate_tokenonly + bmode_costs[mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
       best_rd         = this_rd;
+      best_tx         = x->e_mbd.mode_info_context->mbmi.txfm_size;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
       *skippable      = s;
     }
-  }
 
-  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
-                                          MACROBLOCK *x,
-                                          int *Rate,
-                                          int *rate_y,
-                                          int *Distortion,
-                                          int *skippable,
-                                          int64_t txfm_cache[NB_TXFM_MODES]) {
-  MB_PREDICTION_MODE mode;
-  TX_SIZE txfm_size = 0;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int rate, ratey;
-  int distortion, skip;
-  int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-
-  int i;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  // Y Search for 16x16 intra prediction mode
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t local_txfm_cache[NB_TXFM_MODES];
-
-    mbmi->mode = mode;
-
-    vp9_build_intra_predictors_mby(xd);
-
-    macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
-
-    // FIXME add compoundmode cost
-    // FIXME add rate for mode2
-    rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
-
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected = mode;
-      txfm_size = mbmi->txfm_size;
-      best_rd = this_rd;
-      *Rate = rate;
-      *rate_y = ratey;
-      *Distortion = distortion;
-      *skippable = skip;
-    }
-
     for (i = 0; i < NB_TXFM_MODES; i++) {
       int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                        local_txfm_cache[cpi->common.txfm_mode];
+                       local_txfm_cache[cpi->common.txfm_mode];
       if (adj_rd < txfm_cache[i]) {
         txfm_cache[i] = adj_rd;
       }
@@ -1464,760 +905,55 @@
     }
   }
 
-  mbmi->txfm_size = txfm_size;
-  mbmi->mode = mode_selected;
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+  x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx;
 
   return best_rd;
 }
 
-
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  VP9_COMMON *const cm = &cpi->common;
-  MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion = 0, rate = 0;
-  BLOCK  *be = x->block + ib;
-  BLOCKD *b = xd->block + ib;
-  ENTROPY_CONTEXT_PLANES ta, tl;
-  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;
-  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;
-
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16.  Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 8x8 block
-   * */
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 8);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16 * 4);
-
-  // perform transformation of dimension 8x8
-  // note the input and output index mapping
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t this_rd;
-    int rate_t = 0;
-
-    // FIXME rate for compound mode and second intrapred mode
-    rate = mode_costs[mode];
-    b->bmi.as_mode.first = mode;
-
-    vp9_intra8x8_predict(xd, b, mode, b->predictor);
-
-    vp9_subtract_4b_c(be, b, 16);
-
-    if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);
-      else
-        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);
-      x->quantize_b_8x8(x, idx, tx_type);
-
-      // compute quantization mse of 8x8 block
-      distortion = vp9_block_error_c((x->block + idx)->coeff,
-                                     (xd->block + idx)->dqcoeff, 64);
-
-      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
-
-      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];
-      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];
-      ta1 = ta0 + 1;
-      tl1 = tl0 + 1;
-
-      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                           ta0, tl0, TX_8X8);
-
-      rate += rate_t;
-    } else {
-      static const int iblock[4] = {0, 1, 4, 5};
-      TX_TYPE tx_type;
-      int i;
-      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));
-      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];
-      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];
-      ta1 = ta0 + 1;
-      tl1 = tl0 + 1;
-      distortion = 0;
-      rate_t = 0;
-      for (i = 0; i < 4; ++i) {
-        int do_two = 0;
-        b = &xd->block[ib + iblock[i]];
-        be = &x->block[ib + iblock[i]];
-        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-        if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);
-          vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);
-        } else if (!(i & 1) &&
-                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);
-          do_two = 1;
-        } else {
-          x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4(x, ib + iblock[i]);
-        }
-        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);
-        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
-                              TX_4X4);
-        if (do_two) {
-          i++;
-          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,
-                                TX_4X4);
-        }
-      }
-      b = &xd->block[ib];
-      be = &x->block[ib];
-      rate += rate_t;
-    }
-
-    distortion >>= 2;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-    if (this_rd < best_rd) {
-      *bestrate = rate;
-      *bestratey = rate_t;
-      *bestdistortion = distortion;
-      besta0 = *ta0;
-      besta1 = *ta1;
-      bestl0 = *tl0;
-      bestl1 = *tl1;
-      best_rd = this_rd;
-      *best_mode = mode;
-      copy_predictor_8x8(best_predictor, b->predictor);
-      vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
-      vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
-    }
-  }
-  b->bmi.as_mode.first = (*best_mode);
-  vp9_encode_intra8x8(x, ib);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    a[vp9_block2above[TX_8X8][idx]]     = besta0;
-    a[vp9_block2above[TX_8X8][idx] + 1] = besta1;
-    l[vp9_block2left[TX_8X8][idx]]      = bestl0;
-    l[vp9_block2left[TX_8X8][idx] + 1]  = bestl1;
-  } else {
-    a[vp9_block2above[TX_4X4][ib]]     = besta0;
-    a[vp9_block2above[TX_4X4][ib + 1]] = besta1;
-    l[vp9_block2left[TX_4X4][ib]]      = bestl0;
-    l[vp9_block2left[TX_4X4][ib + 4]]  = bestl1;
-  }
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  int *i8x8mode_costs;
-
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(
-                  cpi, mb, ib, &best_mode,
-                  i8x8mode_costs, ta, tl, &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-  }
-
-  *Rate = cost;
-  *rate_y = tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
-                                                  int *rate, int *rate_y,
-                                                  int *distortion,
-                                                  int *mode8x8,
-                                                  int64_t best_yrd,
-                                                  int64_t *txfm_cache) {
-  VP9_COMMON *const cm = &cpi->common;
+static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                      int *rate, int *distortion,
+                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+  if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+    vp9_encode_intra_block_uv(cm, x, bsize);
+  else
+    vp9_xform_quant_sbuv(cm, x, bsize);
 
-  mbmi->txfm_size = TX_4X4;
-  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                         &d4x4, best_yrd);
-  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  mbmi->txfm_size = TX_8X8;
-  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                         &d8x8, best_yrd);
-  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
-                               tmp_rd_4x4s : tmp_rd_8x8s;
-  if (cm->txfm_mode == TX_MODE_SELECT) {
-    if (tmp_rd_4x4s < tmp_rd_8x8s) {
-      *rate = r4x4 + cost0;
-      *rate_y = tok4x4 + cost0;
-      *distortion = d4x4;
-      mbmi->txfm_size = TX_4X4;
-      tmp_rd = tmp_rd_4x4s;
-    } else {
-      *rate = r8x8 + cost1;
-      *rate_y = tok8x8 + cost1;
-      *distortion = d8x8;
-      mbmi->txfm_size = TX_8X8;
-      tmp_rd = tmp_rd_8x8s;
-
-      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-    }
-  } else if (cm->txfm_mode == ONLY_4X4) {
-    *rate = r4x4;
-    *rate_y = tok4x4;
-    *distortion = d4x4;
-    mbmi->txfm_size = TX_4X4;
-    tmp_rd = tmp_rd_4x4;
-  } else {
-    *rate = r8x8;
-    *rate_y = tok8x8;
-    *distortion = d8x8;
-    mbmi->txfm_size = TX_8X8;
-    tmp_rd = tmp_rd_8x8;
-
-    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  }
-
-  return tmp_rd;
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
+  *skippable  = vp9_sbuv_is_skippable(xd, bsize);
 }
 
-static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b++)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_4X4][b],
-                        tl + vp9_block2left[TX_4X4][b],
-                        TX_4X4);
-
-  return cost;
-}
-
-
-static int64_t rd_inter16x16_uv_4x4(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip,
-                                    int do_ctx_backup) {
-  vp9_transform_mbuv_4x4(x);
-  vp9_quantize_mbuv_4x4(x);
-
-  *rate       = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *xd = &mb->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-    ta = (ENTROPY_CONTEXT *)&t_above;
-    tl = (ENTROPY_CONTEXT *)&t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_8X8);
-
-  return cost;
-}
-
-static int64_t rd_inter16x16_uv_8x8(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                    int *distortion, int fullpixel, int *skip,
-                                    int do_ctx_backup) {
-  vp9_transform_mbuv_8x8(x);
-  vp9_quantize_mbuv_8x8(x);
-
-  *rate       = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);
-  *distortion = vp9_mbuverror(x) / 4;
-  *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {
-  int b;
-  int cost = 0;
+static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+                             int *rate, int *distortion, int *skippable,
+                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
-
-    ta = (ENTROPY_CONTEXT *) &t_above;
-    tl = (ENTROPY_CONTEXT *) &t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_16X16);
-
-  return cost;
-}
-
-static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   int backup) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_16x16(x);
-  vp9_quantize_sbuv_16x16(x);
-
-  *rate       = rd_cost_sbuv_16x16(cm, x, backup);
-  *distortion = vp9_sb_block_error_c(x->coeff + 1024,
-                                     xd->dqcoeff + 1024, 512, 2);
-  *skip       = vp9_sbuv_is_skippable_16x16(xd);
-}
-
-static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
 
-  if (mbmi->txfm_size >= TX_16X16) {
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);
-  } else {
-    int n, r = 0, d = 0;
-    int skippable = 1;
-    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-    ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-    ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  if (mbmi->ref_frame[0] > INTRA_FRAME)
+    vp9_subtract_sbuv(x, bsize);
 
-    memcpy(t_above, xd->above_context, sizeof(t_above));
-    memcpy(t_left, xd->left_context, sizeof(t_left));
-
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-      int d_tmp, s_tmp, r_tmp;
-
-      xd->above_context = ta + x_idx;
-      xd->left_context = tl + y_idx;
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-
-      if (mbmi->txfm_size == TX_4X4) {
-        rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
-      } else {
-        rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
-      }
-
-      r += r_tmp;
-      d += d_tmp;
-      skippable = skippable && s_tmp;
-    }
-
-    *rate = r;
-    *distortion = d;
-    *skip = skippable;
-    xd->left_context = tl;
-    xd->above_context = ta;
-    memcpy(xd->above_context, t_above, sizeof(t_above));
-    memcpy(xd->left_context, t_left, sizeof(t_left));
-  }
-
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,
-                                int *distortion, int *skip);
-static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
-                                int *distortion, int fullpixel, int *skip) {
-  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);
-  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
-}
-
-static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,
-                                    MACROBLOCK *x,
-                                    int *rate,
-                                    int *rate_tokenonly,
-                                    int *distortion,
-                                    int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_4x4(x);
-    vp9_quantize_mbuv_4x4(x);
-
-    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);
-    rate = rate_to
-           + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_4x4(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-
-  mbmi->uv_mode = mode_selected;
-}
-
-static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
-                                        MACROBLOCK *x,
-                                        int *rate,
-                                        int *rate_tokenonly,
-                                        int *distortion,
-                                        int *skippable) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t best_rd = INT64_MAX;
-  int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
-  int rate_to, UNINITIALIZED_IS_SAFE(skip);
-
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int rate;
-    int distortion;
-    int64_t this_rd;
-
-    mbmi->uv_mode = mode;
-    vp9_build_intra_predictors_mbuv(&x->e_mbd);
-    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                      x->e_mbd.predictor, x->src.uv_stride);
-    vp9_transform_mbuv_8x8(x);
-
-    vp9_quantize_mbuv_8x8(x);
-
-    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);
-    rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
-
-    distortion = vp9_mbuverror(x) / 4;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-
-    if (this_rd < best_rd) {
-      skip = vp9_mbuv_is_skippable_8x8(xd);
-      best_rd = this_rd;
-      d = distortion;
-      r = rate;
-      *rate_tokenonly = rate_to;
-      mode_selected = mode;
-    }
-  }
-  *rate = r;
-  *distortion = d;
-  *skippable = skip;
-  mbmi->uv_mode = mode_selected;
-}
-
-// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
-static void super_block_uvrd(VP9_COMMON *const cm,
-                             MACROBLOCK *x,
-                             int *rate,
-                             int *distortion,
-                             int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-
-  if (mbmi->txfm_size >= TX_16X16) {
-    vp9_subtract_sbuv_s_c(x->src_diff,
-                          usrc, vsrc, src_uv_stride,
-                          udst, vdst, dst_uv_stride);
-    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);
+  if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_32X32);
+  } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_16X16);
+  } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_8X8);
   } else {
-    int d = 0, r = 0, n, s = 1;
-    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-    ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
-    ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
-
-    memcpy(t_above, xd->above_context, sizeof(t_above));
-    memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-      if (mbmi->txfm_size == TX_4X4) {
-        vp9_transform_mbuv_4x4(x);
-        vp9_quantize_mbuv_4x4(x);
-        s &= vp9_mbuv_is_skippable_4x4(xd);
-      } else {
-        vp9_transform_mbuv_8x8(x);
-        vp9_quantize_mbuv_8x8(x);
-        s &= vp9_mbuv_is_skippable_8x8(xd);
-      }
-
-      d += vp9_mbuverror(x) >> 2;
-      xd->above_context = t_above + x_idx;
-      xd->left_context = t_left + y_idx;
-      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(cm, x, 0);
-      } else {
-        r += rd_cost_mbuv_8x8(cm, x, 0);
-      }
-    }
-
-    xd->above_context = ta_orig;
-    xd->left_context = tl_orig;
-
-    *distortion = d;
-    *rate       = r;
-    *skippable  = s;
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_4X4);
   }
 }
 
-static int rd_cost_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int backup) {
-  int b;
-  int cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT *ta, *tl;
-
-  if (backup) {
-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);
-
-    ta = (ENTROPY_CONTEXT *) &t_above;
-    tl = (ENTROPY_CONTEXT *) &t_left;
-  } else {
-    ta = (ENTROPY_CONTEXT *)xd->above_context;
-    tl = (ENTROPY_CONTEXT *)xd->left_context;
-  }
-
-  for (b = 16; b < 24; b += 4)
-    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_UV,
-                        ta + vp9_block2above[TX_8X8][b],
-                        tl + vp9_block2left[TX_8X8][b], TX_32X32);
-
-  return cost;
-}
-
-static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   int backup) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sb64uv_32x32(x);
-  vp9_quantize_sb64uv_32x32(x);
-
-  *rate       = rd_cost_sb64uv_32x32(cm, x, backup);
-  *distortion = vp9_sb_block_error_c(x->coeff + 4096,
-                                     xd->dqcoeff + 4096, 2048, 0);
-  *skip       = vp9_sb64uv_is_skippable_32x32(xd);
-}
-
-static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate,
-                                int *distortion,
-                                int *skippable) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
-  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
-  ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
-  int d = 0, r = 0, n, s = 1;
-
-  // FIXME not needed if tx=32x32
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left,  xd->left_context,  sizeof(t_left));
-
-  if (mbmi->txfm_size == TX_32X32) {
-    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,
-                            udst, vdst, dst_uv_stride);
-    rd_inter64x64_uv_32x32(cm, x, &r, &d, &s, 1);
-  } else if (mbmi->txfm_size == TX_16X16) {
-    int n;
-
-    *rate = 0;
-    for (n = 0; n < 4; n++) {
-      int x_idx = n & 1, y_idx = n >> 1;
-      int r_tmp, d_tmp, s_tmp;
-
-      vp9_subtract_sbuv_s_c(x->src_diff,
-                            usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                            vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                            vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
-                            dst_uv_stride);
-      xd->above_context = t_above + x_idx * 2;
-      xd->left_context = t_left + y_idx * 2;
-      rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0);
-      r += r_tmp;
-      d += d_tmp;
-      s = s && s_tmp;
-    }
-  } else {
-    for (n = 0; n < 16; n++) {
-      int x_idx = n & 3, y_idx = n >> 2;
-
-      vp9_subtract_mbuv_s_c(x->src_diff,
-                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                            src_uv_stride,
-                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                            dst_uv_stride);
-      if (mbmi->txfm_size == TX_4X4) {
-        vp9_transform_mbuv_4x4(x);
-        vp9_quantize_mbuv_4x4(x);
-        s &= vp9_mbuv_is_skippable_4x4(xd);
-      } else {
-        vp9_transform_mbuv_8x8(x);
-        vp9_quantize_mbuv_8x8(x);
-        s &= vp9_mbuv_is_skippable_8x8(xd);
-      }
-
-      xd->above_context = t_above + x_idx;
-      xd->left_context = t_left + y_idx;
-      d += vp9_mbuverror(x) >> 2;
-      if (mbmi->txfm_size == TX_4X4) {
-        r += rd_cost_mbuv_4x4(cm, x, 0);
-      } else {
-        r += rd_cost_mbuv_8x8(cm, x, 0);
-      }
-    }
-  }
-
-  *distortion = d;
-  *rate       = r;
-  *skippable  = s;
-
-  xd->left_context = tl_orig;
-  xd->above_context = ta_orig;
-}
-
-static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
-                                       int *rate,
-                                       int *rate_tokenonly,
-                                       int *distortion,
-                                       int *skippable) {
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int *rate_tokenonly,
+                                       int *distortion, int *skippable,
+                                       BLOCK_SIZE_TYPE bsize) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
   int64_t best_rd = INT64_MAX, this_rd;
@@ -2226,10 +962,8 @@
 
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                     &this_distortion, &s);
+                     &this_distortion, &s, bsize);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -2249,43 +983,6 @@
   return best_rd;
 }
 
-static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi,
-                                         MACROBLOCK *x,
-                                         int *rate,
-                                         int *rate_tokenonly,
-                                         int *distortion,
-                                         int *skippable) {
-  MB_PREDICTION_MODE mode;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-  int64_t best_rd = INT64_MAX, this_rd;
-  int this_rate_tokenonly, this_rate;
-  int this_distortion, s;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
-
-    super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                        &this_distortion, &s);
-    this_rate = this_rate_tokenonly +
-    x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
-    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
-
-    if (this_rd < best_rd) {
-      mode_selected   = mode;
-      best_rd         = this_rd;
-      *rate           = this_rate;
-      *rate_tokenonly = this_rate_tokenonly;
-      *distortion     = this_distortion;
-      *skippable      = s;
-    }
-  }
-
-  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
-
-  return best_rd;
-}
-
 int vp9_cost_mv_ref(VP9_COMP *cpi,
                     MB_PREDICTION_MODE m,
                     const int mode_context) {
@@ -2296,11 +993,11 @@
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
     VP9_COMMON *pc = &cpi->common;
 
-    vp9_prob p [VP9_MVREFS - 1];
-    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+    vp9_prob p[VP9_INTER_MODES - 1];
+    assert(NEARESTMV <= m  &&  m <= NEWMV);
     vp9_mv_ref_probs(pc, p, mode_context);
-    return cost_token(vp9_mv_ref_tree, p,
-                      vp9_mv_ref_encoding_array - NEARESTMV + m);
+    return cost_token(vp9_sb_mv_ref_tree, p,
+                      vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
   } else
     return 0;
 }
@@ -2310,112 +1007,81 @@
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  int *mvjcost, int *mvcost[2]) {
+static int labels2mode(MACROBLOCK *x, int i,
+                       MB_PREDICTION_MODE this_mode,
+                       int_mv *this_mv, int_mv *this_second_mv,
+                       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+                       int_mv seg_mvs[MAX_REF_FRAMES],
+                       int_mv *best_ref_mv,
+                       int_mv *second_best_ref_mv,
+                       int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mode_info_context;
   MB_MODE_INFO * mbmi = &mic->mbmi;
-  const int mis = xd->mode_info_stride;
+  int cost = 0, thismvcost = 0;
+  int idx, idy;
+  int bw = 1 << b_width_log2(mbmi->sb_type);
+  int bh = 1 << b_height_log2(mbmi->sb_type);
 
-  int i, cost = 0, thismvcost = 0;
-
   /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    BLOCKD *const d = xd->block + i;
-    const int row = i >> 2,  col = i & 3;
+   Ones from this macroblock have to be pulled from the BLOCKD array
+   as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+  MB_PREDICTION_MODE m;
 
-    B_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
-
-    if (col  &&  labelings[i] == labelings[i - 1])
-      m = LEFT4X4;
-    else if (row  &&  labelings[i] == labelings[i - 4])
-      m = ABOVE4X4;
-    else {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEW4X4 :
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case LEFT4X4:
-          this_mv->as_int = col ? d[-1].bmi.as_mv[0].as_int :
-                                  left_block_mv(xd, mic, i);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int :
-                                           left_block_second_mv(xd, mic, i);
-          break;
-        case ABOVE4X4:
-          this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int :
-                                  above_block_mv(mic, i, mis);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int :
-                                           above_block_second_mv(mic, i, mis);
-          break;
-        case ZERO4X4:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
+  // the only time we should do costing for new motion vector or mode
+  // is when we are on a new label  (jbb May 08, 2007)
+  switch (m = this_mode) {
+    case NEWMV:
+      this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+      thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
+                                    102, xd->allow_high_precision_mv);
+      if (mbmi->ref_frame[1] > 0) {
+        this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+        thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
+                                      mvjcost, mvcost, 102,
+                                      xd->allow_high_precision_mv);
       }
+      break;
+    case NEARESTMV:
+      this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
+      if (mbmi->ref_frame[1] > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
+      break;
+    case NEARMV:
+      this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
+      if (mbmi->ref_frame[1] > 0)
+        this_second_mv->as_int =
+            frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
+      break;
+    case ZEROMV:
+      this_mv->as_int = 0;
+      if (mbmi->ref_frame[1] > 0)
+        this_second_mv->as_int = 0;
+      break;
+    default:
+      break;
+  }
 
-      if (m == ABOVE4X4) { // replace above with left if same
-        int_mv left_mv, left_second_mv;
+  cost = vp9_cost_mv_ref(cpi, this_mode,
+                         mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
-        left_second_mv.as_int = 0;
-        left_mv.as_int = col ? d[-1].bmi.as_mv[0].as_int :
-                         left_block_mv(xd, mic, i);
-        if (mbmi->second_ref_frame > 0)
-          left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int :
-                                  left_block_second_mv(xd, mic, i);
+  mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
+  if (mbmi->ref_frame[1] > 0)
+    mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
 
-        if (left_mv.as_int == this_mv->as_int &&
-            (mbmi->second_ref_frame <= 0 ||
-             left_second_mv.as_int == this_second_mv->as_int))
-          m = LEFT4X4;
-      }
-
-#if CONFIG_NEWBINTRAMODES
-      cost = x->inter_bmode_costs[
-          m == B_CONTEXT_PRED ? m - CONTEXT_PRED_REPLACEMENTS : m];
-#else
-      cost = x->inter_bmode_costs[m];
-#endif
+  x->partition_info->bmi[i].mode = m;
+  x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+  if (mbmi->ref_frame[1] > 0)
+    x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
+                 &mic->bmi[i], sizeof(mic->bmi[i]));
+      vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx],
+                 &x->partition_info->bmi[i],
+                 sizeof(x->partition_info->bmi[i]));
     }
-
-    d->bmi.as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      d->bmi.as_mv[1].as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
   }
 
   cost += thismvcost;
@@ -2424,203 +1090,102 @@
 
 static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
                                        MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
+                                       int i,
                                        int *labelyrate,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
                                        ENTROPY_CONTEXT *tl) {
-  int i;
+  int k;
   MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int idx, idy;
+  const int src_stride = x->plane[0].src.stride;
+  uint8_t* const src =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src.buf, src_stride);
+  int16_t* src_diff =
+  raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            x->plane[0].src_diff);
+  int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
+  uint8_t* const pre =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].pre[0].buf,
+                            xd->plane[0].pre[0].stride);
+  uint8_t* const dst =
+  raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                            xd->plane[0].dst.buf,
+                            xd->plane[0].dst.stride);
+  int thisdistortion = 0;
+  int thisrate = 0;
 
   *labelyrate = 0;
   *distortion = 0;
-  for (i = 0; i < 16; i++) {
-    if (labels[i] == which_label) {
-      BLOCKD *bd = &x->e_mbd.block[i];
-      BLOCK *be = &x->block[i];
-      int thisdistortion;
 
-      vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,
-                                bd->pre_stride,
-                                bd->predictor, 16,
-                                &bd->bmi.as_mv[0],
-                                &xd->scale_factor[0],
-                                4, 4, 0 /* no avg */, &xd->subpix);
+  vp9_build_inter_predictor(pre,
+                            xd->plane[0].pre[0].stride,
+                            dst,
+                            xd->plane[0].dst.stride,
+                            &xd->mode_info_context->bmi[i].as_mv[0],
+                            &xd->scale_factor[0],
+                            4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix);
 
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        vp9_build_inter_predictor(
-            *(bd->base_second_pre) + bd->pre, bd->pre_stride, bd->predictor, 16,
-            &bd->bmi.as_mv[1], &xd->scale_factor[1], 4, 4,
-            1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT) /* avg */,
-            &xd->subpix);
-      }
-
-      vp9_subtract_b(be, bd, 16);
-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);
-      x->quantize_b_4x4(x, i);
-      thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
-      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
-                                 ta + vp9_block2above[TX_4X4][i],
-                                 tl + vp9_block2left[TX_4X4][i], TX_4X4);
-    }
+  // TODO(debargha): Make this work properly with the
+  // implicit-compoundinter-weight experiment when implicit
+  // weighting for splitmv modes is turned on.
+  if (xd->mode_info_context->mbmi.ref_frame[1] > 0) {
+    uint8_t* const second_pre =
+    raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
+                              xd->plane[0].pre[1].buf,
+                              xd->plane[0].pre[1].stride);
+    vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride,
+                              dst, xd->plane[0].dst.stride,
+                              &xd->mode_info_context->bmi[i].as_mv[1],
+                              &xd->scale_factor[1], 4 * bw, 4 * bh, 1,
+                              &xd->subpix);
   }
-  *distortion >>= 2;
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
 
-static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
-                                           MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT_PLANES tac, tlc;
-  ENTROPY_CONTEXT *tacp = (ENTROPY_CONTEXT *) &tac,
-                  *tlcp = (ENTROPY_CONTEXT *) &tlc;
+  vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8,
+                     src, src_stride,
+                     dst, xd->plane[0].dst.stride);
 
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(ENTROPY_CONTEXT_PLANES));
-    memcpy(&tlc, tl, sizeof(ENTROPY_CONTEXT_PLANES));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      const int use_second_ref =
-          xd->mode_info_context->mbmi.second_ref_frame > 0;
-      int which_mv;
-      int idx = (ib & 8) + ((ib & 2) << 1);
-      BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];
-      BLOCK *be = &x->block[ib], *be2 = &x->block[idx];
-      int thisdistortion;
-
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        uint8_t **base_pre = which_mv ? bd->base_second_pre : bd->base_pre;
-
-        // TODO(debargha): Make this work properly with the
-        // implicit-compoundinter-weight experiment when implicit
-        // weighting for splitmv modes is turned on.
-        vp9_build_inter_predictor(
-            *base_pre + bd->pre, bd->pre_stride, bd->predictor, 16,
-            &bd->bmi.as_mv[which_mv], &xd->scale_factor[which_mv], 8, 8,
-            which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
-            &xd->subpix);
-      }
-
-      vp9_subtract_4b_c(be, bd, 16);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
-          x->quantize_b_8x8(x, idx, DCT_DCT);
-          thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-          otherdist += thisdistortion;
-          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   tacp + vp9_block2above[TX_8X8][idx],
-                                   tlcp + vp9_block2left[TX_8X8][idx],
-                                   TX_8X8);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-        }
-        for (j = 0; j < 4; j += 2) {
-          bd = &xd->block[ib + iblock[j]];
-          be = &x->block[ib + iblock[j]];
-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);
-          thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
-          *distortion += thisdistortion;
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                          ta + vp9_block2above[TX_4X4][ib + iblock[j]],
-                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                          TX_4X4);
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j] + 1,
-                          PLANE_TYPE_Y_WITH_DC,
-                          ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],
-                          TX_4X4);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            BLOCKD *bd = &xd->block[ib + iblock[j]];
-            BLOCK *be = &x->block[ib + iblock[j]];
-            x->fwd_txm8x4(be->src_diff, be->coeff, 32);
-            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]);
-            thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
-            otherdist += thisdistortion;
-            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                            tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
-                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                            TX_4X4);
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j] + 1,
-                            PLANE_TYPE_Y_WITH_DC,
-                            tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
-                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
-                            TX_4X4);
-            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          }
-        }
-        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);
-        x->quantize_b_8x8(x, idx, DCT_DCT);
-        thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   ta + vp9_block2above[TX_8X8][idx],
-                                   tl + vp9_block2left[TX_8X8][idx], TX_8X8);
-      }
+  k = i;
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      k += (idy * 2 + idx);
+      src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
+                                           x->plane[0].src_diff);
+      coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k);
+      x->fwd_txm4x4(src_diff, coeff, 16);
+      x->quantize_b_4x4(x, k, DCT_DCT, 16);
+      thisdistortion += vp9_block_error(coeff,
+                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                     k, 16), 16);
+      thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
+                              ta + (k & 1),
+                              tl + (k >> 1), TX_4X4, 16);
     }
   }
+  *distortion += thisdistortion;
+  *labelyrate += thisrate;
+
   *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
 
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
 typedef struct {
   int_mv *ref_mv, *second_ref_mv;
   int_mv mvp;
 
   int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
   int r;
   int d;
   int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
-
+  MB_PREDICTION_MODE modes[4];
+  int_mv mvs[4], second_mvs[4];
+  int eobs[4];
   int mvthresh;
-  int *mdcounts;
-
-  int_mv sv_mvp[4];     // save 4 mvp from 8x8
-  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
-
 } BEST_SEG_INFO;
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -2632,47 +1197,113 @@
   return r;
 }
 
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  x->plane[0].src.buf =
+      raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                x->plane[0].src.buf,
+                                x->plane[0].src.stride);
+  assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0);
+  x->e_mbd.plane[0].pre[0].buf =
+      raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                x->e_mbd.plane[0].pre[0].buf,
+                                x->e_mbd.plane[0].pre[0].stride);
+  if (mbmi->ref_frame[1])
+    x->e_mbd.plane[0].pre[1].buf =
+        raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i,
+                                  x->e_mbd.plane[0].pre[1].buf,
+                                  x->e_mbd.plane[0].pre[1].stride);
+}
+
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+                                  struct buf_2d orig_pre[2]) {
+  MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi;
+  x->plane[0].src = orig_src;
+  x->e_mbd.plane[0].pre[0] = orig_pre[0];
+  if (mbmi->ref_frame[1])
+    x->e_mbd.plane[0].pre[1] = orig_pre[1];
+}
+
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]);
+
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
+                                    int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                    int mi_row, int mi_col) {
   int i, j;
-  int const *labels;
   int br = 0, bd = 0;
-  B_PREDICTION_MODE this_mode;
+  MB_PREDICTION_MODE this_mode;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
+  const int label_count = 4;
   int64_t this_segment_rd = 0, other_segment_rd;
   int label_mv_thresh;
   int rate = 0;
   int sbr = 0, sbd = 0;
   int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
-
+  int best_eobs[4] = { 0 };
+  BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int idx, idy;
   vp9_variance_fn_ptr_t *v_fn_ptr;
+  YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
+  ENTROPY_CONTEXT t_above[4], t_left[4];
+  ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
 
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
-  ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
-  ENTROPY_CONTEXT *ta_b, *tl_b;
+  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
+  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
 
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)];
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  ta_b = (ENTROPY_CONTEXT *)&t_above_b;
-  tl_b = (ENTROPY_CONTEXT *)&t_left_b;
-
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
-
   // 64 makes this threshold really big effectively
   // making it so that we very rarely check mvs on
   // segments.   setting this to 1 would make mv thresh
@@ -2680,206 +1311,195 @@
   label_mv_thresh = 1 * bsi->mvthresh / label_count;
 
   // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
-  rate += vp9_cost_mv_ref(cpi, SPLITMV,
-                          mbmi->mb_mode_context[mbmi->ref_frame]);
-  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-  br += rate;
   other_segment_rd = this_segment_rd;
 
-  mbmi->txfm_size = tx_size;
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
+  for (idy = 0; idy < 2; idy += bh) {
+    for (idx = 0; idx < 2; idx += bw) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
+      int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
+      int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+      int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
+      MB_PREDICTION_MODE mode_selected = ZEROMV;
+      int bestlabelyrate = 0;
+      i = idy * 2 + idx;
 
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd, other_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
-      ENTROPY_CONTEXT *ta_s;
-      ENTROPY_CONTEXT *tl_s;
+      frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
+      frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
+      vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+                                    &frame_mv[NEARESTMV][mbmi->ref_frame[0]],
+                                    &frame_mv[NEARMV][mbmi->ref_frame[0]],
+                                    i, 0);
+      if (mbmi->ref_frame[1] > 0)
+        vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+                                   &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
+                                   &frame_mv[NEARMV][mbmi->ref_frame[1]],
+                                   i, 1);
 
-      vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
-      vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+      // search for the best motion vector on this segment
+      for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+        int64_t this_rd;
+        int distortion;
+        int labelyrate;
+        ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
+        const struct buf_2d orig_src = x->plane[0].src;
+        struct buf_2d orig_pre[2];
 
-      ta_s = (ENTROPY_CONTEXT *)&t_above_s;
-      tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+        vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
 
-      // motion search for newmv (single predictor case only)
-      if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        BLOCK *c;
-        BLOCKD *e;
+        vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
+        vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
 
-        /* Is the best so far sufficiently good that we cant justify doing
-         * and new motion search. */
-        if (best_label_rd < label_mv_thresh)
-          break;
+        // motion search for newmv (single predictor case only)
+        if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV) {
+          int step_param = 0;
+          int further_steps;
+          int thissme, bestsme = INT_MAX;
+          int sadpb = x->sadperbit4;
+          int_mv mvp_full;
 
-        if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+          /* Is the best so far sufficiently good that we cant justify doing
+           * and new motion search. */
+          if (best_label_rd < label_mv_thresh)
+            break;
 
-            step_param = bsi->sv_istep[i];
+          if (cpi->compressor_speed) {
+            // use previous block's result as next block's MV predictor.
+            if (i > 0) {
+              bsi->mvp.as_int =
+              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
+              if (i == 2)
+                bsi->mvp.as_int =
+                x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int;
+              step_param = 2;
+            }
           }
 
-          // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int;
-            if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int;
-            step_param = 2;
-          }
-        }
+          further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
 
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
           mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
 
-          // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
-
-          c = &x->block[n];
-          e = &x->e_mbd.block[n];
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, c, e, &mvp_full, step_param,
+          // adjust src pointer for this block
+          mi_buf_shift(x, i);
+          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                            sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
+                                           bsi->ref_mv, &mode_mv[NEWMV]);
 
-          sseshift = segmentation_to_sseshift[segmentation];
-
           // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
+          if (cpi->compressor_speed == 0) {
             /* Check if mvp_full is within the range. */
             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
                      x->mv_row_min, x->mv_row_max);
 
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+            thissme = cpi->full_search_sad(x, &mvp_full,
                                            sadpb, 16, v_fn_ptr,
                                            x->nmvjointcost, x->mvcost,
-                                           bsi->ref_mv);
+                                           bsi->ref_mv, i);
 
             if (thissme < bestsme) {
               bestsme = thissme;
-              mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int;
+              mode_mv[NEWMV].as_int =
+                  x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int;
             } else {
               /* The full search result is actually worse so re-instate the
                * previous best vector */
-              e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int;
+              x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int =
+                  mode_mv[NEWMV].as_int;
             }
           }
-        }
 
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       x->nmvjointcost, x->mvcost,
-                                       &distortion, &sse);
+          if (bestsme < INT_MAX) {
+            int distortion;
+            unsigned int sse;
+            cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
+                                         bsi->ref_mv, x->errorperbit, v_fn_ptr,
+                                         x->nmvjointcost, x->mvcost,
+                                         &distortion, &sse);
 
-          // safe motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
+            // safe motion search result for use in compound prediction
+            seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
+          }
+
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
+        } else if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV) {
+          if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
+            continue;
+
+          // adjust src pointers
+          mi_buf_shift(x, i);
+          if (cpi->sf.comp_inter_joint_search_thresh < bsize) {
+            iterative_motion_search(cpi, x, bsize, frame_mv[this_mode],
+                                    scaled_ref_frame,
+                                    mi_row, mi_col, seg_mvs[i]);
+            seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+            seg_mvs[i][mbmi->ref_frame[1]].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+          }
+          // restore src pointers
+          mi_buf_restore(x, orig_src, orig_pre);
         }
-      } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
-        /* NEW4X4 */
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
+
+        rate = labels2mode(x, i, this_mode, &mode_mv[this_mode],
+                           &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
+                           bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                           x->mvcost, cpi);
+
+        // Trap vectors that reach beyond the UMV borders
+        if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+            ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+            ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
           continue;
         }
-      }
+        if (mbmi->ref_frame[1] > 0 &&
+            mv_check_bounds(x, &second_mode_mv[this_mode]))
+          continue;
 
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                         x->mvcost);
-
-      // Trap vectors that reach beyond the UMV borders
-      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
-          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
-        continue;
-      }
-      if (mbmi->second_ref_frame > 0 &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
-
-      if (segmentation == PARTITIONING_4X4) {
         this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
-                                          &distortion, ta_s, tl_s);
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
-                                              x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              ta_s, tl_s);
-      }
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
+                                          x, i, &labelyrate,
+                                          &distortion, t_above_s, t_left_s);
+        this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+        rate += labelyrate;
 
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.eobs[j];
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.eobs[idx];
-          }
+        if (this_rd < best_label_rd) {
+          sbr = rate;
+          sbd = distortion;
+          bestlabelyrate = labelyrate;
+          mode_selected = this_mode;
+          best_label_rd = this_rd;
+          best_eobs[i] = x->e_mbd.plane[0].eobs[i];
+          vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
+          vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
         }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
+      } /*for each 4x4 mode*/
 
-        vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
-        vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+      vpx_memcpy(t_above, t_above_b, sizeof(t_above));
+      vpx_memcpy(t_left, t_left_b, sizeof(t_left));
 
-      }
-    } /*for each 4x4 mode*/
+      labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
+                  &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
+                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                  x->mvcost, cpi);
 
-    vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+      br += sbr;
+      bd += sbd;
+      segmentyrate += bestlabelyrate;
+      this_segment_rd += best_label_rd;
+      other_segment_rd += best_other_rd;
 
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, x->mvcost);
-
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
+      for (j = 1; j < bh; ++j)
+        vpx_memcpy(&x->partition_info->bmi[i + j * 2],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+      for (j = 1; j < bw; ++j)
+        vpx_memcpy(&x->partition_info->bmi[i + j],
+                   &x->partition_info->bmi[i],
+                   sizeof(x->partition_info->bmi[i]));
+    }
   } /* for each label */
 
   if (this_segment_rd < bsi->segment_rd) {
@@ -2887,152 +1507,33 @@
     bsi->d = bd;
     bsi->segment_yrate = segmentyrate;
     bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
 
     // store everything needed to come back to this!!
-    for (i = 0; i < 16; i++) {
+    for (i = 0; i < 4; i++) {
       bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->second_ref_frame > 0)
+      if (mbmi->ref_frame[1] > 0)
         bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
       bsi->modes[i] = x->partition_info->bmi[i].mode;
       bsi->eobs[i] = best_eobs[i];
     }
   }
-
-  if (completed) {
-    *completed = i;
-  }
 }
 
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =
-            txfm_cache[ALLOW_32X32] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static INLINE void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
-
 static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
                                        int64_t best_rd,
-                                       int *mdcounts,
                                        int *returntotrate,
                                        int *returnyrate,
                                        int *returndistortion,
                                        int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
+                                       int_mv seg_mvs[4][MAX_REF_FRAMES],
+                                       int mi_row, int mi_col) {
   int i;
   BEST_SEG_INFO bsi;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
 
   vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
 
   bsi.segment_rd = best_rd;
   bsi.ref_mv = best_ref_mv;
@@ -3039,126 +1540,43 @@
   bsi.second_ref_mv = second_best_ref_mv;
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
 
-  for (i = 0; i < 16; i++)
-    bsi.modes[i] = ZERO4X4;
+  for (i = 0; i < 4; i++)
+    bsi.modes[i] = ZEROMV;
 
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
+  rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col);
 
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
-
-      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
-       * according to the closeness of 2 MV. */
-      /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-      /* Not skip 4x4 if speed=0 (good quality) */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
-
   /* set it to the best */
-  for (i = 0; i < 16; i++) {
-    BLOCKD *bd = &x->e_mbd.block[i];
-
-    bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int;
-    if (mbmi->second_ref_frame > 0)
-      bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int;
-    x->e_mbd.eobs[i] = bsi.eobs[i];
+  for (i = 0; i < 4; i++) {
+    x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
+    if (mbmi->ref_frame[1] > 0)
+      x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
+      bsi.second_mvs[i].as_int;
+    x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
   }
 
-  *returntotrate = bsi.r;
-  *returndistortion = bsi.d;
-  *returnyrate = bsi.segment_yrate;
-  *skippable = bsi.txfm_size == TX_4X4 ?
-                    vp9_mby_is_skippable_4x4(&x->e_mbd) :
-                    vp9_mby_is_skippable_8x8(&x->e_mbd);
-
   /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
+  x->partition_info->count = 4;
 
   for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
+    x->partition_info->bmi[i].mode = bsi.modes[i];
+    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv;
+    if (mbmi->ref_frame[1] > 0)
+      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv;
   }
   /*
    * used to set mbmi->mv.as_int
    */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
-  if (mbmi->second_ref_frame > 0)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
+  x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int;
+  if (mbmi->ref_frame[1] > 0)
+    x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int;
 
+  *returntotrate = bsi.r;
+  *returndistortion = bsi.d;
+  *returnyrate = bsi.segment_yrate;
+  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8);
+  mbmi->mode = bsi.modes[3];
+
   return (int)(bsi.segment_rd);
 }
 
@@ -3169,18 +1587,17 @@
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int_mv this_mv;
   int i;
-  int zero_seen = FALSE;
+  int zero_seen = 0;
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
 
-  BLOCK *b = &x->block[0];
-  uint8_t *src_y_ptr = *(b->base_src);
+  uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
   int row_offset, col_offset;
 
   // Get the sad for each candidate reference mv
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) {
     this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int;
 
     // The list is at an end if we see 0 for a second time.
@@ -3193,7 +1610,7 @@
     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
 
     // Find sad for current vector.
-    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, b->src_stride,
+    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                            ref_y_ptr, ref_y_stride,
                                            0x7fffffff);
 
@@ -3208,150 +1625,64 @@
   x->mv_best_ref_index[ref_frame] = best_index;
 }
 
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[i];
-    // printf("%d,%d,%d,%d\n",
-    //       modes[0], modes[1], modes[2], modes[3]);
-  }
-
-  for (i = 0; i < 16; i++) {
-    xd->block[i].bmi = xd->mode_info_context->bmi[i];
-  }
-}
-
-extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
-  int norm_cnt[MAX_REF_FRAMES];
-  const int *const rfct = cpi->count_mb_ref_frame_usage;
-  int intra_count = rfct[INTRA_FRAME];
-  int last_count  = rfct[LAST_FRAME];
-  int gf_count    = rfct[GOLDEN_FRAME];
-  int arf_count   = rfct[ALTREF_FRAME];
-
-  // Work out modified reference frame probabilities to use where prediction
-  // of the reference frame fails
-  if (pred_ref == INTRA_FRAME) {
-    norm_cnt[0] = 0;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[0] = 0;    // This branch implicit
-  } else if (pred_ref == LAST_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = 0;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[1] = 0;    // This branch implicit
-  } else if (pred_ref == GOLDEN_FRAME) {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = 0;
-    norm_cnt[3] = arf_count;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs_single,
+                                     unsigned int *ref_costs_comp,
+                                     vp9_prob *comp_mode_p) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  int seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                             SEG_LVL_REF_FRAME);
+  if (seg_ref_active) {
+    vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+    vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+    *comp_mode_p = 128;
   } else {
-    norm_cnt[0] = intra_count;
-    norm_cnt[1] = last_count;
-    norm_cnt[2] = gf_count;
-    norm_cnt[3] = 0;
-    vp9_calc_ref_probs(norm_cnt, mod_refprobs);
-    mod_refprobs[2] = 0;  // This branch implicit
-  }
-}
+    vp9_prob intra_inter_p = vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER);
+    vp9_prob comp_inter_p = 128;
 
-static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,
-                                     int idx, int val, int weight) {
-  unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;
-  unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;
-  // weight is 16-bit fixed point, so this basically calculates:
-  // 0.5 + weight * cost1 + (1.0 - weight) * cost0
-  return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
-}
+    if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+      comp_inter_p = vp9_get_pred_prob(cm, xd, PRED_COMP_INTER_INTER);
+      *comp_mode_p = comp_inter_p;
+    } else {
+      *comp_mode_p = 128;
+    }
 
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  vp9_prob *mod_refprobs;
+    ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
 
-  unsigned int cost;
-  int pred_ref;
-  int pred_flag;
-  int pred_ctx;
-  int i;
+    if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+      vp9_prob ref_single_p1 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P1);
+      vp9_prob ref_single_p2 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P2);
+      unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-  vp9_prob pred_prob, new_pred_prob;
-  int seg_ref_active;
-  int seg_ref_count = 0;
-  seg_ref_active = vp9_segfeature_active(xd,
-                                         segment_id,
-                                         SEG_LVL_REF_FRAME);
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        base_cost += vp9_cost_bit(comp_inter_p, 0);
 
-  if (seg_ref_active) {
-    seg_ref_count = vp9_check_segref(xd, segment_id, INTRA_FRAME)  +
-                    vp9_check_segref(xd, segment_id, LAST_FRAME)   +
-                    vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
-                    vp9_check_segref(xd, segment_id, ALTREF_FRAME);
-  }
-
-  // Get the predicted reference for this mb
-  pred_ref = vp9_get_pred_ref(cm, xd);
-
-  // Get the context probability for the prediction flag (based on last frame)
-  pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);
-
-  // Predict probability for current frame based on stats so far
-  pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
-  new_pred_prob = get_binary_prob(cpi->ref_pred_count[pred_ctx][0],
-                                  cpi->ref_pred_count[pred_ctx][1]);
-
-  // Get the set of probabilities to use if prediction fails
-  mod_refprobs = cm->mod_refprobs[pred_ref];
-
-  // For each possible selected reference frame work out a cost.
-  for (i = 0; i < MAX_REF_FRAMES; i++) {
-    if (seg_ref_active && seg_ref_count == 1) {
-      cost = 0;
+      ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
+          ref_costs_single[ALTREF_FRAME] = base_cost;
+      ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
+      ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+      ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+      ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
     } else {
-      pred_flag = (i == pred_ref);
+      ref_costs_single[LAST_FRAME]   = 512;
+      ref_costs_single[GOLDEN_FRAME] = 512;
+      ref_costs_single[ALTREF_FRAME] = 512;
+    }
+    if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+      vp9_prob ref_comp_p = vp9_get_pred_prob(cm, xd, PRED_COMP_REF_P);
+      unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
 
-      // Get the prediction for the current mb
-      cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
-                           pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+      if (cm->comp_pred_mode == HYBRID_PREDICTION)
+        base_cost += vp9_cost_bit(comp_inter_p, 1);
 
-      // for incorrectly predicted cases
-      if (! pred_flag) {
-        vp9_prob curframe_mod_refprobs[3];
-
-        if (cpi->seg0_progress) {
-          estimate_curframe_refprobs(cpi, curframe_mod_refprobs, pred_ref);
-        } else {
-          vpx_memset(curframe_mod_refprobs, 0, sizeof(curframe_mod_refprobs));
-        }
-
-        cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 0,
-                              (i != INTRA_FRAME), cpi->seg0_progress);
-        if (i != INTRA_FRAME) {
-          cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 1,
-                                (i != LAST_FRAME), cpi->seg0_progress);
-          if (i != LAST_FRAME) {
-            cost += weighted_cost(mod_refprobs, curframe_mod_refprobs, 2,
-                                  (i != GOLDEN_FRAME), cpi->seg0_progress);
-          }
-        }
-      }
+      ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
+    } else {
+      ref_costs_comp[LAST_FRAME]   = 512;
+      ref_costs_comp[GOLDEN_FRAME] = 512;
     }
-
-    ref_costs[i] = cost;
   }
 }
 
@@ -3368,11 +1699,11 @@
   // restored if we decide to encode this way
   ctx->skip = x->skip;
   ctx->best_mode_index = mode_index;
-  vpx_memcpy(&ctx->mic, xd->mode_info_context,
-             sizeof(MODE_INFO));
+  ctx->mic = *xd->mode_info_context;
+
   if (partition)
-    vpx_memcpy(&ctx->partition_info, partition,
-               sizeof(PARTITION_INFO));
+    ctx->partition_info = *partition;
+
   ctx->best_ref_mv.as_int = ref_mv->as_int;
   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
@@ -3383,82 +1714,69 @@
   memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff));
 }
 
-static void inter_mode_cost(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *rate2, int *distortion2, int *rate_y,
-                            int *distortion, int* rate_uv, int *distortion_uv,
-                            int *skippable, int64_t txfm_cache[NB_TXFM_MODES]) {
-  int y_skippable, uv_skippable;
+static void setup_pred_block(const MACROBLOCKD *xd,
+                             struct buf_2d dst[MAX_MB_PLANE],
+                             const YV12_BUFFER_CONFIG *src,
+                             int mi_row, int mi_col,
+                             const struct scale_factors *scale,
+                             const struct scale_factors *scale_uv) {
+  int i;
 
-  // Y cost and distortion
-  macro_block_yrd(cpi, x, rate_y, distortion, &y_skippable, txfm_cache);
+  dst[0].buf = src->y_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[1].stride = dst[2].stride = src->uv_stride;
+#if CONFIG_ALPHA
+  dst[3].buf = src->alpha_buffer;
+  dst[3].stride = src->alpha_stride;
+#endif
 
-  *rate2 += *rate_y;
-  *distortion2 += *distortion;
-
-  // UV cost and distortion
-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                    x->e_mbd.predictor, x->src.uv_stride);
-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 &&
-      x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED &&
-      x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
-    rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable, 1);
-  else
-    rd_inter16x16_uv_4x4(cpi, x, rate_uv, distortion_uv,
-                         cpi->common.full_pixel, &uv_skippable, 1);
-
-  *rate2 += *rate_uv;
-  *distortion2 += *distortion_uv;
-  *skippable = y_skippable && uv_skippable;
+  // TODO(jkoleszar): Make scale factors per-plane data
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
+                     i ? scale_uv : scale,
+                     xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+  }
 }
 
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
-                               int block_size,
-                               int mb_row, int mb_col,
+                               enum BlockSize block_size,
+                               int mi_row, int mi_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
-                               int frame_mdcounts[4][4],
-                               YV12_BUFFER_CONFIG yv12_mb[4],
+                               struct buf_2d yv12_mb[4][MAX_MB_PLANE],
                                struct scale_factors scale[MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int use_prev_in_find_mv_refs, use_prev_in_find_best_ref;
 
   // set up scaling factors
   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
+
   scale[frame_type].x_offset_q4 =
-      (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf;
+      ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
+       VP9_REF_SCALE_SHIFT) & 0xf;
   scale[frame_type].y_offset_q4 =
-      (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf;
+      ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
+       VP9_REF_SCALE_SHIFT) & 0xf;
 
   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   // use the UV scaling factors.
-  setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col,
+  setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
                    &scale[frame_type], &scale[frame_type]);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  use_prev_in_find_mv_refs = cm->width == cm->last_width &&
-                             cm->height == cm->last_height &&
-                             !cpi->common.error_resilient_mode;
   vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,
-                   use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL,
+                   xd->prev_mode_info_context,
                    frame_type,
                    mbmi->ref_mvs[frame_type],
                    cpi->common.ref_frame_sign_bias);
 
   // Candidate refinement carried out at encoder and decoder
-  use_prev_in_find_best_ref =
-      scale[frame_type].x_num == scale[frame_type].x_den &&
-      scale[frame_type].y_num == scale[frame_type].y_den &&
-      !cm->error_resilient_mode &&
-      !cm->frame_parallel_decoding_mode;
   vp9_find_best_ref_mvs(xd,
-                        use_prev_in_find_best_ref ?
-                            yv12_mb[frame_type].y_buffer : NULL,
-                        yv12->y_stride,
                         mbmi->ref_mvs[frame_type],
                         &frame_nearest_mv[frame_type],
                         &frame_near_mv[frame_type]);
@@ -3466,9 +1784,9 @@
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
   // The current implementation doesn't support scaling.
-  if (scale[frame_type].x_num == scale[frame_type].x_den &&
-      scale[frame_type].y_num == scale[frame_type].y_den)
-    mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,
+  if (scale[frame_type].x_scale_fp == (1 << VP9_REF_SCALE_SHIFT) &&
+      scale[frame_type].y_scale_fp == (1 << VP9_REF_SCALE_SHIFT))
+    mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
             frame_type, block_size);
 }
 
@@ -3485,7 +1803,10 @@
   // TODO(debargha): Implement the functions by interpolating from a
   // look-up table
   vp9_clear_system_state();
-  {
+  if (var == 0 || n == 0) {
+    *rate = 0;
+    *dist = 0;
+  } else {
     double D, R;
     double s2 = (double) var / n;
     double s = sqrt(s2);
@@ -3515,44 +1836,229 @@
   vp9_clear_system_state();
 }
 
+static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
+                                           struct macroblockd_plane *pd) {
+  return get_block_size(plane_block_width(bsize, pd),
+                        plane_block_height(bsize, pd));
+}
+
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse, var;
+  int i, rate_sum = 0, dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    // TODO(dkovalev) the same code in get_plane_block_size
+    const int bw = plane_block_width(bsize, pd);
+    const int bh = plane_block_height(bsize, pd);
+    const enum BlockSize bs = get_block_size(bw, bh);
+    int rate, dist;
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
+static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+
+  const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+  const int m = vp9_switchable_interp_map[mbmi->interp_filter];
+  return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
+}
+
+static void iterative_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE_TYPE bsize,
+                                    int_mv *frame_mv,
+                                    YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                    int mi_row, int mi_col,
+                                    int_mv single_newmv[MAX_REF_FRAMES]) {
+  int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  int refs[2] = { mbmi->ref_frame[0],
+                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  int_mv ref_mv[2];
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  int ite;
+  // Prediction buffer from second frame.
+  uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
+
+  // Do joint motion search in compound mode to get more accurate mv.
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+  struct buf_2d scaled_first_yv12;
+  int last_besterr[2] = {INT_MAX, INT_MAX};
+
+  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
+  if (scaled_ref_frame[0]) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_second_yv12[i] = xd->plane[i].pre[1];
+
+    setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                     NULL, NULL);
+  }
+
+  xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                          mi_row, mi_col);
+  xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                          mi_row, mi_col);
+  scaled_first_yv12 = xd->plane[0].pre[0];
+
+  // Initialize mv using single prediction mode result.
+  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+  // Allow joint search multiple times iteratively for each ref frame
+  // and break out the search loop if it couldn't find better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    int sadpb = x->sadperbit16;
+    int_mv tmp_mv;
+    int search_range = 3;
+
+    int tmp_col_min = x->mv_col_min;
+    int tmp_col_max = x->mv_col_max;
+    int tmp_row_min = x->mv_row_min;
+    int tmp_row_max = x->mv_row_max;
+    int id = ite % 2;
+
+    // Initialized here because of compiler problem in Visual Studio.
+    ref_yv12[0] = xd->plane[0].pre[0];
+    ref_yv12[1] = xd->plane[0].pre[1];
+
+    // Get pred block from second frame.
+    vp9_build_inter_predictor(ref_yv12[!id].buf,
+                              ref_yv12[!id].stride,
+                              second_pred, pw,
+                              &frame_mv[refs[!id]],
+                              &xd->scale_factor[!id],
+                              pw, ph, 0,
+                              &xd->subpix);
+
+    // Compound motion search on first ref frame.
+    if (id)
+      xd->plane[0].pre[0] = ref_yv12[id];
+    vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+    // Use mv result from single mode as mvp.
+    tmp_mv.as_int = frame_mv[refs[id]].as_int;
+
+    tmp_mv.as_mv.col >>= 3;
+    tmp_mv.as_mv.row >>= 3;
+
+    // Small-range full-pixel motion search
+    bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                       search_range,
+                                       &cpi->fn_ptr[block_size],
+                                       x->nmvjointcost, x->mvcost,
+                                       &ref_mv[id], second_pred,
+                                       pw, ph);
+
+    x->mv_col_min = tmp_col_min;
+    x->mv_col_max = tmp_col_max;
+    x->mv_row_min = tmp_row_min;
+    x->mv_row_max = tmp_row_max;
+
+    if (bestsme < INT_MAX) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+
+      bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                             &ref_mv[id],
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &sse, second_pred,
+                                             pw, ph);
+    }
+
+    if (id)
+      xd->plane[0].pre[0] = scaled_first_yv12;
+
+    if (bestsme < last_besterr[id]) {
+      frame_mv[refs[id]].as_int = tmp_mv.as_int;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+
+  // restore the predictor
+  if (scaled_ref_frame[0]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
+  if (scaled_ref_frame[1]) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[1] = backup_second_yv12[i];
+  }
+
+  vpx_free(second_pred);
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 enum BlockSize block_size,
-                                 int *saddone, int near_sadidx[],
-                                 int mdcounts[4], int64_t txfm_cache[],
+                                 BLOCK_SIZE_TYPE bsize,
+                                 int64_t txfm_cache[],
                                  int *rate2, int *distortion, int *skippable,
-                                 int *compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                 int *compmode_interintra_cost,
-#endif
                                  int *rate_y, int *distortion_y,
                                  int *rate_uv, int *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
-                                 int mode_index,
                                  INTERPOLATIONFILTERTYPE *best_filter,
-                                 int_mv frame_mv[MB_MODE_COUNT]
-                                                [MAX_REF_FRAMES],
-                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
-                                 int mb_row, int mb_col) {
+                                 int_mv *frame_mv,
+                                 YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES]) {
+  const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
+
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
+  const enum BlockSize uv_block_size = get_plane_block_size(bsize,
+                                                            &xd->plane[1]);
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &xd->block[0];
-  const int is_comp_pred = (mbmi->second_ref_frame > 0);
-#if CONFIG_COMP_INTERINTRA_PRED
-  const int is_comp_interintra_pred = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
+  const int is_comp_pred = (mbmi->ref_frame[1] > 0);
   const int num_refs = is_comp_pred ? 2 : 1;
   const int this_mode = mbmi->mode;
   int i;
-  int refs[2] = { mbmi->ref_frame,
-                  (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
+  int refs[2] = { mbmi->ref_frame[0],
+                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
   int_mv ref_mv[2];
   int64_t this_rd = 0;
-  unsigned char tmp_ybuf[64 * 64];
-  unsigned char tmp_ubuf[32 * 32];
-  unsigned char tmp_vbuf[32 * 32];
+  unsigned char tmp_buf[MAX_MB_PLANE][64 * 64];
   int pred_exists = 0;
   int interpolating_intpel_seen = 0;
   int intpel_mv;
@@ -3564,19 +2070,27 @@
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
-        if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
-            frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
+        // Initialize mv using single prediction mode result.
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        if (cpi->sf.comp_inter_joint_search_thresh < bsize)
+          iterative_motion_search(cpi, x, bsize, frame_mv, scaled_ref_frame,
+                                  mi_row, mi_col, single_newmv);
+
+        if (frame_mv[refs[0]].as_int == INVALID_MV ||
+            frame_mv[refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
+        *rate2 += vp9_mv_bit_cost(&frame_mv[refs[0]],
                                   &ref_mv[0],
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
-        *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
+        *rate2 += vp9_mv_bit_cost(&frame_mv[refs[1]],
                                   &ref_mv[1],
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
       } else {
-        YV12_BUFFER_CONFIG backup_yv12 = xd->pre;
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
         int bestsme = INT_MAX;
         int further_steps, step_param = cpi->sf.first_step;
         int sadpb = x->sadperbit16;
@@ -3588,14 +2102,17 @@
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
+          int i;
+
           // Swap out the reference frame for a version that's been scaled to
           // match the resolution of the current frame, allowing the existing
           // motion search code to be used without additional modifications.
-          xd->pre = *scaled_ref_frame;
-          xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16;
-          xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
-          xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                           NULL, NULL);
         }
 
         vp9_clamp_mv_min_max(x, &ref_mv[0]);
@@ -3615,7 +2132,7 @@
         // Further step/diamond searches as necessary
         further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
-        bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
+        bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                          sadpb, further_steps, 1,
                                          &cpi->fn_ptr[block_size],
                                          &ref_mv[0], &tmp_mv);
@@ -3628,7 +2145,7 @@
         if (bestsme < INT_MAX) {
           int dis; /* TODO: use dis in distortion calculation later. */
           unsigned int sse;
-          cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
+          cpi->find_fractional_mv_step(x, &tmp_mv,
                                        &ref_mv[0],
                                        x->errorperbit,
                                        &cpi->fn_ptr[block_size],
@@ -3635,8 +2152,8 @@
                                        x->nmvjointcost, x->mvcost,
                                        &dis, &sse);
         }
-        d->bmi.as_mv[0].as_int = tmp_mv.as_int;
-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int;
+        frame_mv[refs[0]].as_int = tmp_mv.as_int;
+        single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -3644,8 +2161,11 @@
                                   96, xd->allow_high_precision_mv);
 
         // restore the predictor, if required
-        if (scaled_ref_frame) {
-          xd->pre = backup_yv12;
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
         }
       }
       break;
@@ -3656,9 +2176,13 @@
       break;
   }
   for (i = 0; i < num_refs; ++i) {
-    cur_mv[i] = frame_mv[this_mode][refs[i]];
+    cur_mv[i] = frame_mv[refs[i]];
     // Clip "next_nearest" so that it does not extend to far out of image
-    clamp_mv2(&cur_mv[i], xd);
+    if (this_mode == NEWMV)
+      assert(!clamp_mv2(&cur_mv[i], xd));
+    else
+      clamp_mv2(&cur_mv[i], xd);
+
     if (mv_check_bounds(x, &cur_mv[i]))
       return INT64_MAX;
     mbmi->mv[i].as_int = cur_mv[i].as_int;
@@ -3669,24 +2193,8 @@
    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
    * words if you present them in that order, the second one is always known
    * if the first is known */
-  *compmode_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP),
-                                is_comp_pred);
   *rate2 += vp9_cost_mv_ref(cpi, this_mode,
-                            mbmi->mb_mode_context[mbmi->ref_frame]);
-#if CONFIG_COMP_INTERINTRA_PRED
-  if (!is_comp_pred) {
-    *compmode_interintra_cost = vp9_cost_bit(cm->fc.interintra_prob,
-                                             is_comp_interintra_pred);
-    if (is_comp_interintra_pred) {
-      *compmode_interintra_cost +=
-          x->mbmode_cost[xd->frame_type][mbmi->interintra_mode];
-#if SEPARATE_INTERINTRA_UV
-      *compmode_interintra_cost +=
-          x->intra_uv_mode_cost[xd->frame_type][mbmi->interintra_uv_mode];
-#endif
-    }
-  }
-#endif
+                            mbmi->mb_mode_context[mbmi->ref_frame[0]]);
 
   pred_exists = 0;
   interpolating_intpel_seen = 0;
@@ -3698,342 +2206,106 @@
                  (mbmi->mv[1].as_mv.col & 15) == 0;
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
-  if (block_size == BLOCK_64X64) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+  if (cpi->speed > 4) {
+    *best_filter = EIGHTTAP;
+  } else {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+      if (cm->mcomp_filter_type == SWITCHABLE)
+        rs = get_switchable_rate(cm, x);
+
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_inter64x64_predictors_sb(xd,
-                                           xd->dst.y_buffer,
-                                           xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.y_stride,
-                                           xd->dst.uv_stride,
-                                           mb_row, mb_col);
-        var = vp9_variance64x64(*(b->base_src), b->src_stride,
-                                xd->dst.y_buffer, xd->dst.y_stride, &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, 64 * 64, xd->block[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
-                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 32 * 32, xd->block[16].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
-                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 32 * 32, xd->block[20].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
+        vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
+
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
-        int i;
-        for (i = 0; i < 64; ++i)
-          vpx_memcpy(tmp_ybuf + i * 64,
-                     xd->dst.y_buffer + i * xd->dst.y_stride,
-                     sizeof(unsigned char) * 64);
-        for (i = 0; i < 32; ++i)
-          vpx_memcpy(tmp_ubuf + i * 32,
-                     xd->dst.u_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 32);
-        for (i = 0; i < 32; ++i)
-          vpx_memcpy(tmp_vbuf + i * 32,
-                     xd->dst.v_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 32);
-        pred_exists = 1;
-      }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
-    }
-  } else if (block_size == BLOCK_32X32) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-       switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-       ++switchable_filter_index) {
-      int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
-      } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_inter32x32_predictors_sb(xd,
-                                           xd->dst.y_buffer,
-                                           xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.y_stride,
-                                           xd->dst.uv_stride,
-                                           mb_row, mb_col);
-        var = vp9_variance32x32(*(b->base_src), b->src_stride,
-                                xd->dst.y_buffer, xd->dst.y_stride, &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, 32 * 32, xd->block[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
-                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[16].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
-                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);
-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[20].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        int p;
+
+        for (p = 0; p < MAX_MB_PLANE; p++) {
+          const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
+          const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+          int i;
+
+          for (i = 0; i < y; i++)
+            vpx_memcpy(&tmp_buf[p][64 * i],
+                       xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x);
         }
-      }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
-      if (newbest) {
-        best_rd = rd;
-        *best_filter = mbmi->interp_filter;
-      }
-      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
-          (cm->mcomp_filter_type != SWITCHABLE &&
-           cm->mcomp_filter_type == mbmi->interp_filter)) {
-        int i;
-        for (i = 0; i < 32; ++i)
-          vpx_memcpy(tmp_ybuf + i * 64,
-                     xd->dst.y_buffer + i * xd->dst.y_stride,
-                     sizeof(unsigned char) * 32);
-        for (i = 0; i < 16; ++i)
-          vpx_memcpy(tmp_ubuf + i * 32,
-                     xd->dst.u_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 16);
-        for (i = 0; i < 16; ++i)
-          vpx_memcpy(tmp_vbuf + i * 32,
-                     xd->dst.v_buffer + i * xd->dst.uv_stride,
-                     sizeof(unsigned char) * 16);
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
-  } else {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    assert(block_size == BLOCK_16X16);
-    for (switchable_filter_index = 0;
-       switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-       ++switchable_filter_index) {
-      int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-        const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-      }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
-      } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
-        vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
-                                           xd->predictor + 256,
-                                           xd->predictor + 320,
-                                           16, 8, mb_row, mb_col);
-        var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                                xd->predictor, 16, &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = vp9_variance8x8(x->src.u_buffer, x->src.uv_stride,
-                              &xd->predictor[256], 8, &sse);
-        model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = vp9_variance8x8(x->src.v_buffer, x->src.uv_stride,
-                              &xd->predictor[320], 8, &sse);
-        model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
-        }
-      }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
-      if (newbest) {
-        best_rd = rd;
-        *best_filter = mbmi->interp_filter;
-      }
-      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
-          (cm->mcomp_filter_type != SWITCHABLE &&
-           cm->mcomp_filter_type == mbmi->interp_filter)) {
-        vpx_memcpy(tmp_ybuf, xd->predictor, sizeof(unsigned char) * 256);
-        vpx_memcpy(tmp_ubuf, xd->predictor + 256, sizeof(unsigned char) * 64);
-        vpx_memcpy(tmp_vbuf, xd->predictor + 320, sizeof(unsigned char) * 64);
-        pred_exists = 1;
-      }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
-    }
   }
 
   // Set the appripriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+                             cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
+
   if (pred_exists) {
-    if (block_size == BLOCK_64X64) {
-      for (i = 0; i < 64; ++i)
-        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
-                   sizeof(unsigned char) * 64);
-      for (i = 0; i < 32; ++i)
-        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,
-                   sizeof(unsigned char) * 32);
-      for (i = 0; i < 32; ++i)
-        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
-                   sizeof(unsigned char) * 32);
-    } else if (block_size == BLOCK_32X32) {
-      for (i = 0; i < 32; ++i)
-        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,
-                   sizeof(unsigned char) * 32);
-      for (i = 0; i < 16; ++i)
-        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,
-                   sizeof(unsigned char) * 16);
-      for (i = 0; i < 16; ++i)
-        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,
-                   sizeof(unsigned char) * 16);
-    } else {
-      vpx_memcpy(xd->predictor, tmp_ybuf, sizeof(unsigned char) * 256);
-      vpx_memcpy(xd->predictor + 256, tmp_ubuf, sizeof(unsigned char) * 64);
-      vpx_memcpy(xd->predictor + 320, tmp_vbuf, sizeof(unsigned char) * 64);
+    int p;
+
+    for (p = 0; p < MAX_MB_PLANE; p++) {
+      const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y;
+      const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x;
+      int i;
+
+      for (i = 0; i < y; i++)
+        vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride,
+                   &tmp_buf[p][64 * i], x);
     }
   } else {
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
-    if (block_size == BLOCK_64X64) {
-      vp9_build_inter64x64_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else if (block_size == BLOCK_32X32) {
-      vp9_build_inter32x32_predictors_sb(xd,
-                                         xd->dst.y_buffer,
-                                         xd->dst.u_buffer,
-                                         xd->dst.v_buffer,
-                                         xd->dst.y_stride,
-                                         xd->dst.uv_stride,
-                                         mb_row, mb_col);
-    } else {
-      vp9_build_inter16x16_predictors_mb(xd, xd->predictor,
-                                         xd->predictor + 256,
-                                         xd->predictor + 320,
-                                         16, 8, mb_row, mb_col);
-    }
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];
-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
-  }
+  if (cpi->common.mcomp_filter_type == SWITCHABLE)
+    *rate2 += get_switchable_rate(cm, x);
 
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
     x->skip = 1;
   else if (x->encode_breakout) {
     unsigned int var, sse;
-    int threshold = (xd->block[0].dequant[1]
-                     * xd->block[0].dequant[1] >> 4);
+    int threshold = (xd->plane[0].dequant[1]
+                     * xd->plane[0].dequant[1] >> 4);
 
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    if (block_size == BLOCK_64X64) {
-      var = vp9_variance64x64(*(b->base_src), b->src_stride,
-                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
-    } else if (block_size == BLOCK_32X32) {
-      var = vp9_variance32x32(*(b->base_src), b->src_stride,
-                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
-    } else {
-      assert(block_size == BLOCK_16X16);
-      var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                              xd->predictor, 16, &sse);
-    }
+    var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
+                                     x->plane[0].src.stride,
+                                     xd->plane[0].dst.buf,
+                                     xd->plane[0].dst.stride,
+                                     &sse);
 
     if ((int)sse < threshold) {
-      unsigned int q2dc = xd->block[0].dequant[0];
+      unsigned int q2dc = xd->plane[0].dequant[0];
       /* If there is no codeable 2nd order dc
          or a very small uniform pixel change change */
       if ((sse - var < q2dc * q2dc >> 4) ||
@@ -4040,26 +2312,17 @@
           (sse / 2 > var && sse - var < 64)) {
         // Check u and v to make sure skip is ok
         int sse2;
+        unsigned int sse2u, sse2v;
+        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
+                                            x->plane[1].src.stride,
+                                            xd->plane[1].dst.buf,
+                                            xd->plane[1].dst.stride, &sse2u);
+        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
+                                            x->plane[1].src.stride,
+                                            xd->plane[2].dst.buf,
+                                            xd->plane[1].dst.stride, &sse2v);
+        sse2 = sse2u + sse2v;
 
-        if (block_size == BLOCK_64X64) {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
-                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
-          var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
-                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        } else if (block_size == BLOCK_32X32) {
-          unsigned int sse2u, sse2v;
-          var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
-                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
-          var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
-                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
-          sse2 = sse2u + sse2v;
-        } else {
-          assert(block_size == BLOCK_16X16);
-          sse2 = vp9_uvsse(x);
-        }
-
         if (sse2 * 2 < threshold) {
           x->skip = 1;
           *distortion = sse + sse2;
@@ -4077,42 +2340,21 @@
   }
 
   if (!x->skip) {
-    if (block_size == BLOCK_64X64) {
-      int skippable_y, skippable_uv;
+    int skippable_y, skippable_uv;
 
-      // Y cost and distortion
-      super_block_64_yrd(cpi, x, rate_y, distortion_y,
-                         &skippable_y, txfm_cache);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
+    // Y cost and distortion
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+                    bsize, txfm_cache);
 
-      rd_inter64x64_uv(cpi, x, rate_uv, distortion_uv,
-                       cm->full_pixel, &skippable_uv);
+    *rate2 += *rate_y;
+    *distortion += *distortion_y;
 
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-    } else if (block_size == BLOCK_32X32) {
-      int skippable_y, skippable_uv;
+    super_block_uvrd(cm, x, rate_uv, distortion_uv,
+                     &skippable_uv, bsize);
 
-      // Y cost and distortion
-      super_block_yrd(cpi, x, rate_y, distortion_y,
-                      &skippable_y, txfm_cache);
-      *rate2 += *rate_y;
-      *distortion += *distortion_y;
-
-      rd_inter32x32_uv(cpi, x, rate_uv, distortion_uv,
-                       cm->full_pixel, &skippable_uv);
-
-      *rate2 += *rate_uv;
-      *distortion += *distortion_uv;
-      *skippable = skippable_y && skippable_uv;
-    } else {
-      assert(block_size == BLOCK_16X16);
-      inter_mode_cost(cpi, x, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    }
+    *rate2 += *rate_uv;
+    *distortion += *distortion_uv;
+    *skippable = skippable_y && skippable_uv;
   }
 
   if (!(*mode_excluded)) {
@@ -4121,1065 +2363,82 @@
     } else {
       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
     }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
-#endif
   }
 
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
-static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                               int mb_row, int mb_col,
-                               int *returnrate, int *returndistortion,
-                               int64_t *returnintra) {
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-    VP9_ALT_FLAG };
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate, int *returndist,
+                               BLOCK_SIZE_TYPE bsize,
+                               PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_PREDICTION_MODE best_mode = DC_PRED;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
-  int64_t best_overall_rd = INT64_MAX;
-  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
-  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
-  int uv_intra_skippable = 0;
-  int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
-  int uv_intra_skippable_8x8 = 0;
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-
-  MB_PREDICTION_MODE uv_intra_mode;
-  MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;
-
-  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  int saddone = 0;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
-                                             cpi->common.y1dc_delta_q);
-
-  struct scale_factors scale_factor[4];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
-             sizeof(PICK_MODE_CONTEXT));
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
-                       LAST_FRAME, BLOCK_16X16, mb_row, mb_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
-                       GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
-                       ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  *returnintra = INT64_MAX;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-
-  rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,
-                          &uv_intra_rate_tokenonly, &uv_intra_distortion,
-                          &uv_intra_skippable);
-  uv_intra_mode = mbmi->uv_mode;
-
-  /* rough estimate for now */
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &uv_intra_rate_8x8,
-                                &uv_intra_rate_tokenonly_8x8,
-                                &uv_intra_distortion_8x8,
-                                &uv_intra_skippable_8x8);
-    uv_intra_mode_8x8 = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-    YV12_BUFFER_CONFIG *scaled_ref_frame;
-
-    // These variables hold are rolling total cost and distortion for this mode
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    x->skip = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-
-    mbmi->interp_filter = cm->mcomp_filter_type;
-
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // Ensure that the references used by this mode are available.
-    if (mbmi->ref_frame &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
-      continue;
-
-    // only scale on zeromv.
-    if (mbmi->ref_frame > 0 &&
-          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
-      continue;
-    // Disable this drop out case if  the ref frame segment
-    // level feature is enabled for this segment. This is to
-    // prevent the possibility that the we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    scaled_ref_frame = NULL;
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-      int fb;
-
-      xd->pre = yv12_mb[ref];
-      best_ref_mv = mbmi->ref_mvs[ref][0];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
-      } else {
-        fb = cpi->alt_fb_idx;
-      }
-
-      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      int ref = mbmi->second_ref_frame;
-
-      xd->second_pre = yv12_mb[ref];
-      second_best_ref_mv = mbmi->ref_mvs[ref][0];
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case V_PRED:
-        case H_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          rate2 += intra_cost_penalty;
-        case DC_PRED:
-        case TM_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_mby(&x->e_mbd);
-          macro_block_yrd(cpi, x, &rate_y, &distortion, &skippable, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-          if (mbmi->txfm_size != TX_4X4) {
-            rate2 += uv_intra_rate_8x8;
-            rate_uv = uv_intra_rate_tokenonly_8x8;
-            distortion2 += uv_intra_distortion_8x8;
-            distortion_uv = uv_intra_distortion_8x8;
-            skippable = skippable && uv_intra_skippable_8x8;
-          } else {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            skippable = skippable && uv_intra_skippable;
-          }
-          break;
-        case B_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int64_t tmp_rd;
-
-          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,
-                                                      &distortion, mode8x8,
-                                                      best_yrd, txfm_cache);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          /* TODO: uv rate maybe over-estimated here since there is UV intra
-                   mode coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame > 0;
-      int64_t this_rd_thresh;
-      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
-      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
-      int switchable_filter_index;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-      union b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
-      int pred_exists = 0;
-
-      this_rd_thresh =
-          (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh =
-          (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-
-      for (switchable_filter_index = 0;
-           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-           ++switchable_filter_index) {
-        int newbest;
-        mbmi->interp_filter =
-            vp9_switchable_interp[switchable_filter_index];
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        newbest = (tmp_rd < tmp_best_rd);
-        if (newbest) {
-          tmp_best_filter = mbmi->interp_filter;
-          tmp_best_rd = tmp_rd;
-        }
-        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-            (mbmi->interp_filter == cm->mcomp_filter_type &&
-             cm->mcomp_filter_type != SWITCHABLE)) {
-          tmp_best_rdu = tmp_rd;
-          tmp_best_rate = rate;
-          tmp_best_ratey = rate_y;
-          tmp_best_distortion = distortion;
-          tmp_best_skippable = skippable;
-          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&tmp_best_partition, x->partition_info,
-                     sizeof(PARTITION_INFO));
-          for (i = 0; i < 16; i++) {
-            tmp_best_bmodes[i] = xd->block[i].bmi;
-          }
-          pred_exists = 1;
-        }
-      }  // switchable_filter_index loop
-
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (!pred_exists) {
-        // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-      } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
-        rate = tmp_best_rate;
-        rate_y = tmp_best_ratey;
-        distortion = tmp_best_distortion;
-        skippable = tmp_best_skippable;
-        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
-        vpx_memcpy(x->partition_info, &tmp_best_partition,
-                   sizeof(PARTITION_INFO));
-        for (i = 0; i < 16; i++) {
-          xd->block[i].bmi = xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
-        }
-      }
-
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-            [vp9_switchable_interp_map[mbmi->interp_filter]];
-
-      // If even the 'Y' rd value of split is higher than best so far
-      // then dont bother looking at UV
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);
-        vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                          x->e_mbd.predictor, x->src.uv_stride);
-        rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv,
-                             cpi->common.full_pixel, &uv_skippable, 1);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (!mode_excluded) {
-        if (is_comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
-      }
-#endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_16X16,
-                                  &saddone, near_sadidx, mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
-                                  &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mb_row, mb_col);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra)
-      rate2 += compmode_interintra_cost;
-#endif
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost wont have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
-
-        // Is Mb level skip allowed (i.e. not coded at segment level).
-        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-
-        if (skippable) {
-          mbmi->mb_skip_coeff = 1;
-
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
-
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
-
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
-          }
-        }
-        // Add in the cost of the no skip flag.
-        else {
-          mbmi->mb_skip_coeff = 0;
-          if (mb_skip_allowed) {
-            int prob_skip_cost = vp9_cost_bit(
-                   vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
-                              uv_intra_mode_8x8 : uv_intra_mode);
-#endif
-    }
-#endif
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter = tmp_best_filter;
-      best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
-    }
-
-    // Did this mode help.. i.e. is it the new best mode
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        /*
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
-        }
-        */
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-        if (this_mode <= B_PRED) {
-          if (mbmi->txfm_size != TX_4X4
-              && this_mode != B_PRED
-              && this_mode != I8X8_PRED)
-            mbmi->uv_mode = uv_intra_mode_8x8;
-          else
-            mbmi->uv_mode = uv_intra_mode;
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        }
-
-        other_cost += ref_costs[mbmi->ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-        if ((this_mode == B_PRED)
-            || (this_mode == I8X8_PRED)
-            || (this_mode == SPLITMV))
-          for (i = 0; i < 16; i++) {
-            best_bmodes[i] = xd->block[i].bmi;
-          }
-      }
-
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
-    } else {
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7)
-          * cpi->rd_thresh_mult[mode_index];
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd;
-      int single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-      }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-    }
-
-    /* keep record of best txfm size */
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != B_PRED) {
-          const int64_t txfm_mode_diff =
-              txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          adj_rd = this_rd + txfm_mode_diff;
-        } else {
-          adj_rd = this_rd;
-        }
-        if (adj_rd < best_txfm_rd[i])
-          best_txfm_rd[i] = adj_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= B_PRED));
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-#endif
-
-  // Accumulate filter usage stats
-  // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if ((best_mode >= NEARESTMV) && (best_mode <= SPLITMV))
-    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays a
-  // an alrtef unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode <= ALLOW_8X8)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff =
-      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == B_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-      xd->block[i].bmi.as_mode = xd->mode_info_context->bmi[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv[0].as_int =
-          best_bmodes[i].as_mv[0].as_int;
-    if (mbmi->second_ref_frame > 0)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv[1].as_int =
-            best_bmodes[i].as_mv[1].as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = 0;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                    scale_factor);
-  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
-                       best_mode_index, &best_partition,
-                       &mbmi->ref_mvs[mbmi->ref_frame][0],
-                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                                      mbmi->second_ref_frame][0],
-                       best_pred_diff, best_txfm_diff);
-}
-
-void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *returnrate,
-                                 int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
   int rate_y = 0, rate_uv;
   int rate_y_tokenonly = 0, rate_uv_tokenonly;
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES], err;
+  MB_PREDICTION_MODE mode;
+  TX_SIZE txfm_size;
+  int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
+  int64_t err4x4 = INT64_MAX;
   int i;
 
+  vpx_memset(&txfm_cache,0,sizeof(txfm_cache));
+  ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
+  xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                               &dist_y, &y_skip, txfm_cache);
+                               &dist_y, &y_skip, bsize, txfm_cache);
+  mode = xd->mode_info_context->mbmi.mode;
+  txfm_size = xd->mode_info_context->mbmi.txfm_size;
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip);
+                          &dist_uv, &uv_skip,
+                          (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                       bsize);
+  if (bsize < BLOCK_SIZE_SB8X8)
+    err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
+                                       &rate4x4_y_tokenonly,
+                                       &dist4x4_y, err);
 
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+  if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
                   vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     *returndist = dist_y + (dist_uv >> 2);
-    memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0,
-           sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
+    memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+    xd->mode_info_context->mbmi.mode = mode;
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+  } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) {
+    *returnrate = rate4x4_y + rate_uv +
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist4x4_y + (dist_uv >> 2);
+    vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff));
+    xd->mode_info_context->mbmi.txfm_size = TX_4X4;
   } else {
-    *returnrate = rate_y + rate_uv;
-    if (cpi->common.mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returnrate = rate_y + rate_uv +
+        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-      x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i];
+      ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode];
     }
+    xd->mode_info_context->mbmi.txfm_size = txfm_size;
+    xd->mode_info_context->mbmi.mode = mode;
   }
-}
 
-void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *returnrate,
-                                 int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y = 0, rate_uv;
-  int rate_y_tokenonly = 0, rate_uv_tokenonly;
-  int dist_y = 0, dist_uv;
-  int y_skip = 0, uv_skip;
-  int64_t txfm_cache[NB_TXFM_MODES], err;
-  int i;
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-  err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
-                                 &dist_y, &y_skip, txfm_cache);
-  rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                            &dist_uv, &uv_skip);
-
-  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
-    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
-    vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    *returndist = dist_y + (dist_uv >> 2);
-    memset(x->sb64_context.txfm_rd_diff, 0,
-           sizeof(x->sb64_context.txfm_rd_diff));
-  } else {
-    *returnrate = rate_y + rate_uv;
-    if (cm->mb_no_coeff_skip)
-      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-    *returndist = dist_y + (dist_uv >> 2);
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i];
-    }
-  }
+  ctx->mic = *xd->mode_info_context;
 }
 
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int mi_row, int mi_col,
+                                  int *returnrate,
+                                  int *returndistortion,
+                                  BLOCK_SIZE_TYPE bsize,
+                                  PICK_MODE_CONTEXT *ctx) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-  int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
-  int dist4x4 = 0, dist16x16 = 0, distuv = 0, distuv8x8 = 0;
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly = 0, rateuv8x8_tokenonly = 0;
-  int64_t error8x8;
-  int rate8x8_tokenonly=0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[4];
-  int dist;
-  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;
-  int y_intra16x16_skippable = 0;
-  int64_t txfm_cache[2][NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16, txfm_size_8x8;
-  int i;
-
-  mbmi->ref_frame = INTRA_FRAME;
-  mbmi->mode = DC_PRED;
-  rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,
-                          &uv_intra_skippable);
-  modeuv = mbmi->uv_mode;
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,
-                                &distuv8x8, &uv_intra_skippable_8x8);
-    modeuv8x8 = mbmi->uv_mode;
-  } else {
-    uv_intra_skippable_8x8 = uv_intra_skippable;
-    rateuv8x8 = rateuv;
-    distuv8x8 = distuv;
-    rateuv8x8_tokenonly = rateuv_tokenonly;
-    modeuv8x8 = modeuv;
-  }
-
-  // current macroblock under rate-distortion optimization test loop
-  error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,
-                                          &rate16x16_tokenonly, &dist16x16,
-                                          &y_intra16x16_skippable,
-                                          txfm_cache[1]);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
-    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
-    rate16x16 -= rate16x16_tokenonly;
-  }
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +
-                       txfm_cache[1][i];
-  }
-
-  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,
-                                                &rate8x8_tokenonly,
-                                                &dist8x8, mode8x8,
-                                                error16x16, txfm_cache[1]);
-  txfm_size_8x8 = mbmi->txfm_size;
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];
-    if (tmp_rd < txfm_cache[0][i])
-      txfm_cache[0][i] = tmp_rd;
-  }
-
-  mbmi->txfm_size = TX_4X4;
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16);
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    if (error4x4 < txfm_cache[0][i])
-      txfm_cache[0][i] = error4x4;
-  }
-
-  mbmi->mb_skip_coeff = 0;
-  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = (cm->txfm_mode == ONLY_4X4) ? modeuv : modeuv8x8;
-    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16;
-    if (cm->txfm_mode == ONLY_4X4) {
-      rate += rateuv - rateuv_tokenonly;
-      dist += (distuv >> 2);
-    } else {
-      rate += rateuv8x8 - rateuv8x8_tokenonly;
-      dist += (distuv8x8 >> 2);
-    }
-
-    mbmi->txfm_size = txfm_size_16x16;
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv + rate4x4;
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv8x8;
-      dist = dist16x16 + (distuv8x8 >> 2);
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv + rate4x4;
-      mbmi->mode = B_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv >> 2);
-    } else {
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = txfm_size_8x8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv;
-      dist = dist8x8 + (distuv >> 2);
-    }
-    if (cpi->common.mb_no_coeff_skip)
-      rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
-        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-
-static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                         int mb_row, int mb_col,
-                                         int *returnrate,
-                                         int *returndistortion,
-                                         int block_size) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]);
   MB_PREDICTION_MODE this_mode;
   MB_PREDICTION_MODE best_mode = DC_PRED;
   MV_REFERENCE_FRAME ref_frame;
@@ -5186,8 +2445,8 @@
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
+  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
@@ -5194,9 +2453,6 @@
                      cpi->lst_fb_idx,
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
-  int mdcounts[4];
-  int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
-  int saddone = 0;
   int64_t best_rd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -5205,32 +2461,44 @@
   MB_MODE_INFO best_mbmode;
   int j;
   int mode_index, best_mode_index = 0;
-  unsigned int ref_costs[MAX_REF_FRAMES];
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
+  unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+  vp9_prob comp_mode_p;
   int64_t best_overall_rd = INT64_MAX;
   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0,
-      rate_uv_tokenonly_8x8 = 0;
-  int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
-  MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
-  int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
-  int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
-  MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
+  int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
+  int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
+  MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
   struct scale_factors scale_factor[4];
   unsigned int ref_frame_mask = 0;
   unsigned int mode_mask = 0;
+  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
+  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
+  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
+                                             cpi->common.y_dc_delta_q);
+  int_mv seg_mvs[4][MAX_REF_FRAMES];
+  union b_mode_info best_bmodes[4];
+  PARTITION_INFO best_partition;
+  int bwsl = b_width_log2(bsize);
+  int bws = (1 << bwsl) / 4;  // mode_info step for subsize
+  int bhsl = b_height_log2(bsize);
+  int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
 
+  for (i = 0; i < 4; i++) {
+    int j;
+
+    for (j = 0; j < MAX_REF_FRAMES; j++)
+      seg_mvs[i][j].as_int = INVALID_MV;
+  }
+  // Everywhere the flag is set the error is much higher than its neighbors.
+  ctx->frames_with_high_error = 0;
+  ctx->modes_with_high_error = 0;
+
   xd->mode_info_context->mbmi.segment_id = segment_id;
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+  estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
+                           &comp_mode_p);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&single_newmv, 0, sizeof(single_newmv));
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
     best_pred_rd[i] = INT64_MAX;
@@ -5237,87 +2505,61 @@
   for (i = 0; i < NB_TXFM_MODES; i++)
     best_txfm_rd[i] = INT64_MAX;
 
-  // Create a mask set to 1 for each frame used by a smaller resolution.p
-  if (cpi->Speed > 0) {
+  // Create a mask set to 1 for each frame used by a smaller resolution.
+  if (cpi->speed > 0) {
     switch (block_size) {
       case BLOCK_64X64:
         for (i = 0; i < 4; i++) {
           for (j = 0; j < 4; j++) {
-            ref_frame_mask |= (1 << x->mb_context[i][j].mic.mbmi.ref_frame);
-            mode_mask |= (1 << x->mb_context[i][j].mic.mbmi.mode);
+            ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
+            mode_mask |= x->mb_context[i][j].modes_with_high_error;
           }
         }
         for (i = 0; i < 4; i++) {
-          ref_frame_mask |= (1 << x->sb32_context[i].mic.mbmi.ref_frame);
-          mode_mask |= (1 << x->sb32_context[i].mic.mbmi.mode);
+          ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
+          mode_mask |= x->sb32_context[i].modes_with_high_error;
         }
         break;
       case BLOCK_32X32:
         for (i = 0; i < 4; i++) {
-          ref_frame_mask |= (1
-              << x->mb_context[xd->sb_index][i].mic.mbmi.ref_frame);
-          mode_mask |= (1 << x->mb_context[xd->sb_index][i].mic.mbmi.mode);
+          ref_frame_mask |=
+              x->mb_context[xd->sb_index][i].frames_with_high_error;
+          mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
         }
         break;
+      default:
+        // Until we handle all block sizes set it to present;
+        ref_frame_mask = 0;
+        mode_mask = 0;
+        break;
     }
+    ref_frame_mask = ~ref_frame_mask;
+    mode_mask = ~mode_mask;
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
-                         mb_row, mb_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         frame_mdcounts, yv12_mb, scale_factor);
+                         mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
-  // Disallow intra if none of the smaller prediction sizes used intra and
-  // speed > 0 ;
-  if (cpi->Speed == 0
-      || ( cpi->Speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
-    if (block_size == BLOCK_64X64) {
-      mbmi->mode = DC_PRED;
-      if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-        mbmi->txfm_size = TX_4X4;
-        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                                  &dist_uv_4x4, &uv_skip_4x4);
-        mode_uv_4x4 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode != ONLY_4X4) {
-        mbmi->txfm_size = TX_8X8;
-        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                                  &dist_uv_8x8, &uv_skip_8x8);
-        mode_uv_8x8 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode >= ALLOW_32X32) {
-        mbmi->txfm_size = TX_32X32;
-        rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
-                                  &rate_uv_tokenonly_16x16, &dist_uv_16x16,
-                                  &uv_skip_16x16);
-        mode_uv_16x16 = mbmi->uv_mode;
-      }
-    } else {
-      assert(block_size == BLOCK_32X32);
-      mbmi->mode = DC_PRED;
-      if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-        mbmi->txfm_size = TX_4X4;
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                                &dist_uv_4x4, &uv_skip_4x4);
-        mode_uv_4x4 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode != ONLY_4X4) {
-        mbmi->txfm_size = TX_8X8;
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                                &dist_uv_8x8, &uv_skip_8x8);
-        mode_uv_8x8 = mbmi->uv_mode;
-      }
-      if (cm->txfm_mode >= ALLOW_32X32) {
-        mbmi->txfm_size = TX_32X32;
-        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16,
-                                &rate_uv_tokenonly_16x16, &dist_uv_16x16,
-                                &uv_skip_16x16);
-        mode_uv_16x16 = mbmi->uv_mode;
-      }
+  if (cpi->speed == 0
+      || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+    mbmi->mode = DC_PRED;
+    mbmi->ref_frame[0] = INTRA_FRAME;
+    for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
+                      (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 :
+                       (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32)));
+         i++) {
+      mbmi->txfm_size = i;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
+                              &dist_uv[i], &skip_uv[i],
+                              (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                           bsize);
+      mode_uv[i] = mbmi->uv_mode;
     }
   }
 
@@ -5325,33 +2567,39 @@
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
-    int other_cost = 0;
     int compmode_cost = 0;
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
+    int i;
 
+    for (i = 0; i < NB_TXFM_MODES; ++i)
+      txfm_cache[i] = INT64_MAX;
+
     // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index] ||
-        cpi->rd_threshes[mode_index] == INT_MAX) {
+    if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
+                     cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
+        cpi->rd_threshes[bsize][mode_index] == INT_MAX)
       continue;
-    }
 
+    // Do not allow compound prediction if the segment level reference
+    // frame feature is in use as in this case there can only be one reference.
+    if ((vp9_mode_order[mode_index].second_ref_frame > INTRA_FRAME) &&
+         vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME))
+      continue;
+
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
-    if (!(ref_frame == INTRA_FRAME
-        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
-      continue;
-    }
-    if (cpi->Speed > 0) {
+
+    if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
       }
+      if (!(mode_mask & (1 << this_mode))) {
+        continue;
+      }
       if (vp9_mode_order[mode_index].second_ref_frame != NONE
           && !(ref_frame_mask
               & (1 << vp9_mode_order[mode_index].second_ref_frame))) {
@@ -5359,17 +2607,41 @@
       }
     }
 
-    mbmi->ref_frame = ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+    mbmi->ref_frame[0] = ref_frame;
+    mbmi->ref_frame[1] = vp9_mode_order[mode_index].second_ref_frame;
+
+    if (!(ref_frame == INTRA_FRAME
+        || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
+      continue;
+    }
+    if (!(mbmi->ref_frame[1] == NONE
+        || (cpi->ref_frame_flags & flag_list[mbmi->ref_frame[1]]))) {
+      continue;
+    }
+
+    // TODO(jingning, jkoleszar): scaling reference frame not supported for
+    // SPLITMV.
+    if (mbmi->ref_frame[0] > 0 &&
+          (scale_factor[mbmi->ref_frame[0]].x_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT) ||
+           scale_factor[mbmi->ref_frame[0]].y_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT)) &&
+        this_mode == SPLITMV)
+      continue;
+
+    if (mbmi->ref_frame[1] > 0 &&
+          (scale_factor[mbmi->ref_frame[1]].x_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT) ||
+           scale_factor[mbmi->ref_frame[1]].y_scale_fp !=
+           (1 << VP9_REF_SCALE_SHIFT)) &&
+        this_mode == SPLITMV)
+      continue;
+
+    set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                       scale_factor);
-    comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
+    comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
@@ -5376,58 +2648,48 @@
     mbmi->interp_filter = cm->mcomp_filter_type;
     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
-    // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
-    //  continue;
-
-    if (this_mode == I8X8_PRED || this_mode == B_PRED || this_mode == SPLITMV)
+    if (bsize >= BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV))
       continue;
-    //  if (vp9_mode_order[mode_index].second_ref_frame == INTRA_FRAME)
-    //  continue;
+    if (bsize < BLOCK_SIZE_SB8X8 &&
+        !(this_mode == I4X4_PRED || this_mode == SPLITMV))
+      continue;
 
     if (comp_pred) {
-      int second_ref;
-
-      if (ref_frame == ALTREF_FRAME) {
-        second_ref = LAST_FRAME;
-      } else {
-        second_ref = ref_frame + 1;
-      }
-      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+      if (!(cpi->ref_frame_flags & flag_list[mbmi->ref_frame[1]]))
         continue;
-      mbmi->second_ref_frame = second_ref;
-      set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+      set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                         scale_factor);
 
-      xd->second_pre = yv12_mb[second_ref];
       mode_excluded =
           mode_excluded ?
               mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
     } else {
-      // mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+      // mbmi->ref_frame[1] = vp9_mode_order[mode_index].ref_frame[1];
       if (ref_frame != INTRA_FRAME) {
-        if (mbmi->second_ref_frame != INTRA_FRAME)
+        if (mbmi->ref_frame[1] != INTRA_FRAME)
           mode_excluded =
               mode_excluded ?
                   mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
-#if CONFIG_COMP_INTERINTRA_PRED
-        else
-          mode_excluded = mode_excluded ? mode_excluded : !cm->use_interintra;
-#endif
       }
     }
 
-    xd->pre = yv12_mb[ref_frame];
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+    // Select predictors
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred)
+        xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
 
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, ref_frame)) {
+        vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
       continue;
     // If the segment skip feature is enabled....
     // then do nothing if the current mode is not allowed..
     } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
+               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
@@ -5442,84 +2704,204 @@
         }
       }
     }
+    // TODO(JBB): This is to make up for the fact that we don't have sad
+    // functions that work when the block size reads outside the umv.  We
+    // should fix this either by making the motion search just work on
+    // a representative block in the boundary ( first ) and then implement a
+    // function that does sads when inside the border..
+    if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
+        this_mode == NEWMV) {
+      continue;
+    }
 
-    if (ref_frame == INTRA_FRAME) {
-      if (block_size == BLOCK_64X64) {
-        vp9_build_intra_predictors_sb64y_s(xd);
-        super_block_64_yrd(cpi, x, &rate_y, &distortion_y,
-                           &skippable, txfm_cache);
-      } else {
-        assert(block_size == BLOCK_32X32);
-        vp9_build_intra_predictors_sby_s(xd);
-        super_block_yrd(cpi, x, &rate_y, &distortion_y,
-                        &skippable, txfm_cache);
-      }
-      if (mbmi->txfm_size == TX_4X4) {
-        rate_uv = rate_uv_4x4;
-        distortion_uv = dist_uv_4x4;
-        skippable = skippable && uv_skip_4x4;
-        mbmi->uv_mode = mode_uv_4x4;
-      } else if (mbmi->txfm_size == TX_32X32) {
-        rate_uv = rate_uv_16x16;
-        distortion_uv = dist_uv_16x16;
-        skippable = skippable && uv_skip_16x16;
-        mbmi->uv_mode = mode_uv_16x16;
-      } else {
-        rate_uv = rate_uv_8x8;
-        distortion_uv = dist_uv_8x8;
-        skippable = skippable && uv_skip_8x8;
-        mbmi->uv_mode = mode_uv_8x8;
-      }
+    if (this_mode == I4X4_PRED) {
+      int rate;
 
-      rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
+                                &distortion_y, INT64_MAX);
+      rate2 += rate;
+      rate2 += intra_cost_penalty;
+      distortion2 += distortion_y;
+
+      rate2 += rate_uv_intra[TX_4X4];
+      rate_uv = rate_uv_intra[TX_4X4];
+      distortion2 += dist_uv[TX_4X4];
+      distortion_uv = dist_uv[TX_4X4];
+      mbmi->uv_mode = mode_uv[TX_4X4];
+      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < NB_TXFM_MODES; ++i)
+        txfm_cache[i] = txfm_cache[ONLY_4X4];
+    } else if (ref_frame == INTRA_FRAME) {
+      TX_SIZE uv_tx;
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                      bsize, txfm_cache);
+
+      uv_tx = mbmi->txfm_size;
+      if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8)
+        uv_tx = TX_4X4;
+      if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16)
+        uv_tx = TX_8X8;
+      else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32)
+        uv_tx = TX_16X16;
+
+      rate_uv = rate_uv_intra[uv_tx];
+      distortion_uv = dist_uv[uv_tx];
+      skippable = skippable && skip_uv[uv_tx];
+      mbmi->uv_mode = mode_uv[uv_tx];
+
+      rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv;
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+        rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-    } else {
-      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
-      int fb;
+    } else if (this_mode == SPLITMV) {
+      const int is_comp_pred = mbmi->ref_frame[1] > 0;
+      int rate, distortion;
+      int64_t this_rd_thresh;
+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+      int switchable_filter_index;
+      int_mv *second_ref = is_comp_pred ?
+          &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;
+      union b_mode_info tmp_best_bmodes[16];
+      MB_MODE_INFO tmp_best_mbmode;
+      PARTITION_INFO tmp_best_partition;
+      int pred_exists = 0;
+      int uv_skippable;
 
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
+      this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ?
+          cpi->rd_threshes[bsize][THR_NEWMV] :
+          cpi->rd_threshes[bsize][THR_NEWA];
+      this_rd_thresh = (mbmi->ref_frame[0] == GOLDEN_FRAME) ?
+          cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+
+      for (switchable_filter_index = 0;
+           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
+           ++switchable_filter_index) {
+        int newbest;
+        mbmi->interp_filter =
+        vp9_switchable_interp[switchable_filter_index];
+        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                     &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                     second_ref, INT64_MAX,
+                     &rate, &rate_y, &distortion,
+                     &skippable,
+                     (int)this_rd_thresh, seg_mvs,
+                     mi_row, mi_col);
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          const int rs = get_switchable_rate(cm, x);
+          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        newbest = (tmp_rd < tmp_best_rd);
+        if (newbest) {
+          tmp_best_filter = mbmi->interp_filter;
+          tmp_best_rd = tmp_rd;
+        }
+        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
+            (mbmi->interp_filter == cm->mcomp_filter_type &&
+             cm->mcomp_filter_type != SWITCHABLE)) {
+              tmp_best_rdu = tmp_rd;
+              tmp_best_rate = rate;
+              tmp_best_ratey = rate_y;
+              tmp_best_distortion = distortion;
+              tmp_best_skippable = skippable;
+              tmp_best_mbmode = *mbmi;
+              tmp_best_partition = *x->partition_info;
+              for (i = 0; i < 4; i++)
+                tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
+              pred_exists = 1;
+            }
+      }  // switchable_filter_index loop
+
+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
+                             tmp_best_filter : cm->mcomp_filter_type);
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      if (!pred_exists) {
+        // Handles the special case when a filter that is not in the
+        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+                     &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                     second_ref, INT64_MAX,
+                     &rate, &rate_y, &distortion,
+                     &skippable,
+                     (int)this_rd_thresh, seg_mvs,
+                     mi_row, mi_col);
       } else {
-        fb = cpi->alt_fb_idx;
+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+          int rs = get_switchable_rate(cm, x);
+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
+        }
+        tmp_rd = tmp_best_rdu;
+        rate = tmp_best_rate;
+        rate_y = tmp_best_ratey;
+        distortion = tmp_best_distortion;
+        skippable = tmp_best_skippable;
+        *mbmi = tmp_best_mbmode;
+        *x->partition_info = tmp_best_partition;
+        for (i = 0; i < 4; i++)
+          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
       }
 
+      rate2 += rate;
+      distortion2 += distortion;
+
+      if (cpi->common.mcomp_filter_type == SWITCHABLE)
+        rate2 += get_switchable_rate(cm, x);
+
+      // If even the 'Y' rd value of split is higher than best so far
+      // then don't bother looking at UV
+      vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
+                                      BLOCK_SIZE_SB8X8);
+      vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
+      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+      rate2 += rate_uv;
+      distortion2 += distortion_uv;
+      skippable = skippable && uv_skippable;
+
+      txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      for (i = 0; i < NB_TXFM_MODES; ++i)
+        txfm_cache[i] = txfm_cache[ONLY_4X4];
+
+      if (!mode_excluded) {
+        if (is_comp_pred)
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+        else
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
+
+      compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
+    } else {
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
+      int fb = get_ref_frame_idx(cpi, mbmi->ref_frame[0]);
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+        scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
 
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
+      if (comp_pred) {
+        fb = get_ref_frame_idx(cpi, mbmi->ref_frame[1]);
+        if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+          scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
       }
-#endif
-      this_rd = handle_inter_mode(cpi, x, block_size,
-                                  &saddone, near_sadidx, mdcounts, txfm_cache,
+
+      compmode_cost = vp9_cost_bit(comp_mode_p,
+                                   mbmi->ref_frame[1] > INTRA_FRAME);
+      this_rd = handle_inter_mode(cpi, x, bsize,
+                                  txfm_cache,
                                   &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mb_row, mb_col);
+                                  &tmp_best_filter, frame_mv[this_mode],
+                                  scaled_ref_frame, mi_row, mi_col,
+                                  single_newmv);
       if (this_rd == INT64_MAX)
         continue;
     }
 
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra) {
-      rate2 += compmode_interintra_cost;
-    }
-#endif
     if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
       rate2 += compmode_cost;
     }
@@ -5526,7 +2908,11 @@
 
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
-    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+    if (mbmi->ref_frame[1] > INTRA_FRAME) {
+      rate2 += ref_costs_comp[mbmi->ref_frame[0]];
+    } else {
+      rate2 += ref_costs_single[mbmi->ref_frame[0]];
+    }
 
     if (!disable_skip) {
       // Test for the condition where skip block will be activated
@@ -5533,39 +2919,34 @@
       // because there are no non zero coefficients and make any
       // necessary adjustment for rate. Ignore if skip is coded at
       // segment level as the cost wont have been added in.
-      if (cpi->common.mb_no_coeff_skip) {
-        int mb_skip_allowed;
+      int mb_skip_allowed;
 
-        // Is Mb level skip allowed (i.e. not coded at segment level).
-        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
+      // Is Mb level skip allowed (i.e. not coded at segment level).
+      mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
 
-        if (skippable) {
-          // Back out the coefficient coding costs
-          rate2 -= (rate_y + rate_uv);
-          // for best_yrd calculation
-          rate_uv = 0;
+      if (skippable && bsize >= BLOCK_SIZE_SB8X8) {
+        // Back out the coefficient coding costs
+        rate2 -= (rate_y + rate_uv);
+        // for best_yrd calculation
+        rate_uv = 0;
 
-          if (mb_skip_allowed) {
-            int prob_skip_cost;
+        if (mb_skip_allowed) {
+          int prob_skip_cost;
 
-            // Cost the skip mb case
-            vp9_prob skip_prob =
-              vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
+          // Cost the skip mb case
+          vp9_prob skip_prob =
+            vp9_get_pred_prob(cm, xd, PRED_MBSKIP);
 
-            if (skip_prob) {
-              prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-              rate2 += prob_skip_cost;
-              other_cost += prob_skip_cost;
-            }
+          if (skip_prob) {
+            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
+            rate2 += prob_skip_cost;
           }
         }
+      } else if (mb_skip_allowed) {
         // Add in the cost of the no skip flag.
-        else if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
-                                                          PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                        PRED_MBSKIP), 0);
+        rate2 += prob_skip_cost;
       }
 
       // Calculate the final RD estimate for this mode.
@@ -5574,26 +2955,14 @@
 
 #if 0
     // Keep record of best intra distortion
-    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+    if ((xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) &&
         (this_rd < best_intra_rd)) {
       best_intra_rd = this_rd;
       *returnintra = distortion2;
     }
 #endif
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?
-                              mode_uv_8x8 : mode_uv_4x4);
-#endif
-    }
-#endif
 
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
+    if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME)
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
 
@@ -5601,11 +2970,20 @@
       best_overall_rd = this_rd;
       best_filter = tmp_best_filter;
       best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
     }
 
+    if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+      // Store the respective mode distortions for later use.
+      if (mode_distortions[this_mode] == -1
+          || distortion2 < mode_distortions[this_mode]) {
+        mode_distortions[this_mode] = distortion2;
+      }
+      if (frame_distortions[mbmi->ref_frame[0]] == -1
+          || distortion2 < frame_distortions[mbmi->ref_frame[0]]) {
+        frame_distortions[mbmi->ref_frame[0]] = distortion2;
+      }
+    }
+
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
@@ -5612,16 +2990,20 @@
         // Note index of best mode so far
         best_mode_index = mode_index;
 
-        if (this_mode <= B_PRED) {
+        if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
         }
 
-        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
         *returnrate = rate2;
         *returndistortion = distortion2;
         best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+        best_mbmode = *mbmi;
+        best_partition = *x->partition_info;
+
+        if (this_mode == I4X4_PRED || this_mode == SPLITMV)
+          for (i = 0; i < 4; i++)
+            best_bmodes[i] = xd->mode_info_context->bmi[i];
       }
 #if 0
       // Testing this mode gave rise to an improvement in best error score.
@@ -5649,7 +3031,7 @@
     }
 
     /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
+    if (!disable_skip && mbmi->ref_frame[0] != INTRA_FRAME) {
       int single_rd, hybrid_rd, single_rate, hybrid_rate;
 
       if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
@@ -5663,10 +3045,10 @@
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
+      if (mbmi->ref_frame[1] <= INTRA_FRAME &&
           single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
         best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
+      } else if (mbmi->ref_frame[1] > INTRA_FRAME &&
                  single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
         best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
       }
@@ -5675,14 +3057,23 @@
     }
 
     /* keep record of best txfm size */
+    if (bsize < BLOCK_SIZE_SB32X32) {
+      if (bsize < BLOCK_SIZE_MB16X16) {
+        if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+          txfm_cache[ALLOW_8X8] = txfm_cache[ONLY_4X4];
+        txfm_cache[ALLOW_16X16] = txfm_cache[ALLOW_8X8];
+      }
+      txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16];
+    }
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != B_PRED) {
+        int64_t adj_rd = INT64_MAX;
+        if (this_mode != I4X4_PRED) {
           adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode];
         } else {
           adj_rd = this_rd;
         }
+
         if (adj_rd < best_txfm_rd[i])
           best_txfm_rd[i] = adj_rd;
       }
@@ -5691,22 +3082,61 @@
     if (x->skip && !mode_excluded)
       break;
   }
+  // Flag all modes that have a distortion that's > 2x the best we found at
+  // this level.
+  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
+    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
+      continue;
 
+    if (mode_distortions[mode_index] > 2 * *returndistortion) {
+      ctx->modes_with_high_error |= (1 << mode_index);
+    }
+  }
+
+  // Flag all ref frames that have a distortion that's > 2x the best we found at
+  // this level.
+  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
+      ctx->frames_with_high_error |= (1 << ref_frame);
+    }
+  }
+
+  if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) {
+    *returnrate = INT_MAX;
+    *returndistortion = INT_MAX;
+    return best_rd;
+  }
+
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= B_PRED));
+         (best_mbmode.ref_frame[0] == INTRA_FRAME));
 
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-  // if (is_best_interintra)  printf("best_interintra\n");
-#endif
-
   // Accumulate filter usage stats
   // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if ((best_mode >= NEARESTMV) && (best_mode <= SPLITMV))
+  if (is_inter_mode(best_mode))
     ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
 
-  // TODO(rbultje) integrate with RD thresholding
+  // Updating rd_thresh_freq_fact[] here means that the different
+  // partition/block sizes are handled independently based on the best
+  // choice for the current partition. It may well be better to keep a scaled
+  // best rd so far value and update rd_thresh_freq_fact based on the mode/size
+  // combination that wins out.
+  if (cpi->sf.adpative_rd_thresh) {
+    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
+      if (mode_index == best_mode_index) {
+        cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
+      } else {
+        cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
+        if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
+            (cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+          cpi->rd_thresh_freq_fact[bsize][mode_index] =
+            cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+        }
+      }
+    }
+  }
+
+  // TODO(rbultje) integrate with RD thresholding
 #if 0
   // Reduce the activation RD thresholds for the best choice mode
   if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
@@ -5727,16 +3157,22 @@
   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
       cpi->is_src_frame_alt_ref &&
       (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame[0] != ALTREF_FRAME)
+      && bsize >= BLOCK_SIZE_SB8X8) {
     mbmi->mode = ZEROMV;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->second_ref_frame = INTRA_FRAME;
+    mbmi->ref_frame[0] = ALTREF_FRAME;
+    mbmi->ref_frame[1] = NONE;
     mbmi->mv[0].as_int = 0;
     mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
-    mbmi->partitioning = 0;
-    mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?
-                      TX_32X32 : cm->txfm_mode;
+    mbmi->mb_skip_coeff = 1;
+    if (cm->txfm_mode == TX_MODE_SELECT) {
+      if (bsize >= BLOCK_SIZE_SB32X32)
+        mbmi->txfm_size = TX_32X32;
+      else if (bsize >= BLOCK_SIZE_MB16X16)
+        mbmi->txfm_size = TX_16X16;
+      else
+        mbmi->txfm_size = TX_8X8;
+    }
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
@@ -5744,8 +3180,30 @@
   }
 
   // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+  *mbmi = best_mbmode;
+  if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
+    for (i = 0; i < 4; i++)
+      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
+  }
 
+  if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
+      best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
+    for (i = 0; i < 4; i++)
+      xd->mode_info_context->bmi[i].as_mv[0].as_int =
+          best_bmodes[i].as_mv[0].as_int;
+
+    if (mbmi->ref_frame[1] > 0)
+      for (i = 0; i < 4; i++)
+        xd->mode_info_context->bmi[i].as_mv[1].as_int =
+            best_bmodes[i].as_mv[1].as_int;
+
+    *x->partition_info = best_partition;
+
+    mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
+    mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
+  }
+
   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
@@ -5765,72 +3223,14 @@
   }
 
  end:
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
+  set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
                     scale_factor);
-  {
-    PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?
-                            &x->sb32_context[xd->sb_index] :
-                            &x->sb64_context;
-    store_coding_context(x, p, best_mode_index, NULL,
-                         &mbmi->ref_mvs[mbmi->ref_frame][0],
-                         &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                             mbmi->second_ref_frame][0],
-                         best_pred_diff, best_txfm_diff);
-  }
+  store_coding_context(x, ctx, best_mode_index,
+                       &best_partition,
+                       &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
+                       &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
+                                      mbmi->ref_frame[1]][0],
+                       best_pred_diff, best_txfm_diff);
 
   return best_rd;
-}
-
-int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *returnrate,
-                                    int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
-                                   returnrate, returndistortion, BLOCK_32X32);
-}
-
-int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *returnrate,
-                                    int *returndistortion) {
-  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,
-                                   returnrate, returndistortion, BLOCK_64X64);
-}
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  if (xd->segmentation_enabled)
-    x->encode_breakout = cpi->segment_encode_breakout[*segment_id];
-  else
-    x->encode_breakout = cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    rd_pick_inter_mode(cpi, x, mb_row, mb_col, &rate,
-                       &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
 }
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,26 +19,14 @@
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *r, int *d);
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                               int *r, int *d, BLOCK_SIZE_TYPE bsize,
+                               PICK_MODE_CONTEXT *ctx);
 
-void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *r, int *d);
-
-void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                 int *r, int *d);
-
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
-
-int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
-
-int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mb_row, int mb_col,
-                                    int *r, int *d);
+int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                  int mi_row, int mi_col,
+                                  int *r, int *d, BLOCK_SIZE_TYPE bsize,
+                                  PICK_MODE_CONTEXT *ctx);
 
 void vp9_init_me_luts();
 
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -23,6 +23,52 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
 }
 
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr,
+                            int  src_stride,
+                            const uint8_t *ref_ptr,
+                            int  ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32);
+}
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad64x32(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr,
+                            int  src_stride,
+                            const uint8_t *ref_ptr,
+                            int  ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64);
+}
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x64(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
@@ -31,6 +77,52 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
 }
 
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr,
+                            int   src_stride,
+                            const uint8_t *ref_ptr,
+                            int   ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16);
+}
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad32x16(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr,
+                            int   src_stride,
+                            const uint8_t *ref_ptr,
+                            int   ref_stride,
+                            unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32);
+}
+
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       const uint8_t* const ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad16x32(src_ptr, src_stride,
+                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
 unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
@@ -64,7 +156,22 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
 }
 
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4);
+}
 
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr,
+                          int src_stride,
+                          const uint8_t *ref_ptr,
+                          int ref_stride,
+                          unsigned int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8);
+}
+
 unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                           int  src_stride,
                           const uint8_t *ref_ptr,
@@ -469,6 +576,98 @@
                              ref_ptr[2], ref_stride, 0x7fffffff);
   sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
                              ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t *ref_ptr,
+                     int  ref_stride,
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad8x4(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
+}
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t* const ref_ptr[],
+                     int  ref_stride,
+                     unsigned int *sad_array) {
+  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
+                            ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr,
+                     int  src_stride,
+                     const uint8_t *ref_ptr,
+                     int  ref_stride,
+                     uint32_t *sad_array) {
+  sad_array[0] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr, ref_stride,
+                             0x7fffffff);
+  sad_array[1] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 1, ref_stride,
+                             0x7fffffff);
+  sad_array[2] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 2, ref_stride,
+                             0x7fffffff);
+  sad_array[3] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 3, ref_stride,
+                             0x7fffffff);
+  sad_array[4] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 4, ref_stride,
+                             0x7fffffff);
+  sad_array[5] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 5, ref_stride,
+                             0x7fffffff);
+  sad_array[6] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 6, ref_stride,
+                             0x7fffffff);
+  sad_array[7] = vp9_sad4x8(src_ptr, src_stride,
+                             ref_ptr + 7, ref_stride,
+                             0x7fffffff);
 }
 
 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -15,54 +15,9 @@
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {
-  int mb_row, mb_col;
-
-  MODE_INFO *this_mb_mode_info = cm->mi;
-
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  if ((cm->frame_type == KEY_FRAME) || (cpi->refresh_golden_frame)) {
-    // Reset Gf useage monitors
-    vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
-    cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-  } else {
-    // for each macroblock row in image
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-      // for each macroblock col in image
-      for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-
-        // If using golden then set GF active flag if not already set.
-        // If using last frame 0,0 mode then leave flag as it is
-        // else if using non 0,0 motion or intra modes then clear
-        // flag if it is currently set
-        if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) ||
-            (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME)) {
-          if (*(x->gf_active_ptr) == 0) {
-            *(x->gf_active_ptr) = 1;
-            cpi->gf_active_count++;
-          }
-        } else if ((this_mb_mode_info->mbmi.mode != ZEROMV) &&
-                   *(x->gf_active_ptr)) {
-          *(x->gf_active_ptr) = 0;
-          cpi->gf_active_count--;
-        }
-
-        x->gf_active_ptr++;          // Step onto next entry
-        this_mb_mode_info++;         // skip to next mb
-
-      }
-
-      // this is to account for the border
-      this_mb_mode_info++;
-    }
-  }
-}
-
 void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
 
-  // Set the appropriate feature bit
   cpi->mb.e_mbd.segmentation_enabled = 1;
   cpi->mb.e_mbd.update_mb_segmentation_map = 1;
   cpi->mb.e_mbd.update_mb_segmentation_data = 1;
@@ -69,9 +24,7 @@
 }
 
 void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
-  // Clear the appropriate feature bit
+  VP9_COMP *cpi = (VP9_COMP *)ptr;
   cpi->mb.e_mbd.segmentation_enabled = 0;
 }
 
@@ -81,7 +34,7 @@
 
   // Copy in the new segmentation map
   vpx_memcpy(cpi->segmentation_map, segmentation_map,
-             (cpi->common.mb_rows * cpi->common.mb_cols));
+             (cpi->common.mi_rows * cpi->common.mi_cols));
 
   // Signal that the map should be updated.
   cpi->mb.e_mbd.update_mb_segmentation_map = 1;
@@ -104,104 +57,59 @@
 }
 
 // Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs(MACROBLOCKD *xd,
-                               int *segcounts,
+static void calc_segtree_probs(MACROBLOCKD *xd, int *segcounts,
                                vp9_prob *segment_tree_probs) {
-  int count1, count2;
-
-  // Total count for all segments
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-
   // Work out probabilities of each segment
-  segment_tree_probs[0] = get_binary_prob(count1, count2);
-  segment_tree_probs[1] = get_prob(segcounts[0], count1);
-  segment_tree_probs[2] = get_prob(segcounts[2], count2);
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+
+  segment_tree_probs[0] = get_binary_prob(c01 + c23, c45 + c67);
+  segment_tree_probs[1] = get_binary_prob(c01, c23);
+  segment_tree_probs[2] = get_binary_prob(c45, c67);
+  segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+  segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
+  segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+  segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap(MACROBLOCKD *xd,
-                       int *segcounts,
-                       vp9_prob *probs) {
-  int cost;
-  int count1, count2;
+static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) {
+  const int c01 = segcounts[0] + segcounts[1];
+  const int c23 = segcounts[2] + segcounts[3];
+  const int c45 = segcounts[4] + segcounts[5];
+  const int c67 = segcounts[6] + segcounts[7];
+  const int c0123 = c01 + c23;
+  const int c4567 = c45 + c67;
 
   // Cost the top node of the tree
-  count1 = segcounts[0] + segcounts[1];
-  count2 = segcounts[2] + segcounts[3];
-  cost = count1 * vp9_cost_zero(probs[0]) +
-         count2 * vp9_cost_one(probs[0]);
+  int cost = c0123 * vp9_cost_zero(probs[0]) +
+             c4567 * vp9_cost_one(probs[0]);
 
-  // Now add the cost of each individual segment branch
-  if (count1 > 0)
-    cost += segcounts[0] * vp9_cost_zero(probs[1]) +
-            segcounts[1] * vp9_cost_one(probs[1]);
+  // Cost subsequent levels
+  if (c0123 > 0) {
+    cost += c01 * vp9_cost_zero(probs[1]) +
+            c23 * vp9_cost_one(probs[1]);
 
-  if (count2 > 0)
-    cost += segcounts[2] * vp9_cost_zero(probs[2]) +
-            segcounts[3] * vp9_cost_one(probs[2]);
+    if (c01 > 0)
+      cost += segcounts[0] * vp9_cost_zero(probs[3]) +
+              segcounts[1] * vp9_cost_one(probs[3]);
+    if (c23 > 0)
+      cost += segcounts[2] * vp9_cost_zero(probs[4]) +
+              segcounts[3] * vp9_cost_one(probs[4]);
+  }
 
-  return cost;
-}
+  if (c4567 > 0) {
+    cost += c45 * vp9_cost_zero(probs[2]) +
+            c67 * vp9_cost_one(probs[2]);
 
-// Based on set of segment counts calculate a probability tree
-static void calc_segtree_probs_pred(MACROBLOCKD *xd,
-                                    int (*segcounts)[MAX_MB_SEGMENTS],
-                                    vp9_prob *segment_tree_probs,
-                                    vp9_prob *mod_probs) {
-  int count[4];
-
-  assert(!segcounts[0][0] && !segcounts[1][1] &&
-         !segcounts[2][2] && !segcounts[3][3]);
-
-  // Total count for all segments
-  count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0];
-  count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1];
-  count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2];
-  count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3];
-
-  // Work out probabilities of each segment
-  segment_tree_probs[0] = get_binary_prob(count[0] + count[1],
-                                          count[2] + count[3]);
-  segment_tree_probs[1] = get_binary_prob(count[0], count[1]);
-  segment_tree_probs[2] = get_binary_prob(count[2], count[3]);
-
-  // now work out modified counts that the decoder would have
-  count[0] =        segment_tree_probs[0]  *        segment_tree_probs[1];
-  count[1] =        segment_tree_probs[0]  * (256 - segment_tree_probs[1]);
-  count[2] = (256 - segment_tree_probs[0]) *        segment_tree_probs[2];
-  count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]);
-
-  // Work out modified probabilties depending on what segment was predicted
-  mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]);
-  mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]);
-  mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]);
-  mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]);
-}
-
-// Based on set of segment counts and probabilities calculate a cost estimate
-static int cost_segmap_pred(MACROBLOCKD *xd,
-                            int (*segcounts)[MAX_MB_SEGMENTS],
-                            vp9_prob *probs, vp9_prob *mod_probs) {
-  int pred_seg, cost = 0;
-
-  for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) {
-    int count1, count2;
-
-    // Cost the top node of the tree
-    count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1];
-    count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3];
-    cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) +
-            count2 * vp9_cost_one(mod_probs[pred_seg]);
-
-    // Now add the cost of each individual segment branch
-    if (pred_seg >= 2 && count1) {
-      cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) +
-              segcounts[pred_seg][1] * vp9_cost_one(probs[1]);
-    } else if (pred_seg < 2 && count2 > 0) {
-      cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) +
-              segcounts[pred_seg][3] * vp9_cost_one(probs[2]);
-    }
+    if (c45 > 0)
+      cost += segcounts[4] * vp9_cost_zero(probs[5]) +
+              segcounts[5] * vp9_cost_one(probs[5]);
+    if (c67 > 0)
+      cost += segcounts[6] * vp9_cost_zero(probs[6]) +
+              segcounts[7] * vp9_cost_one(probs[6]);
   }
 
   return cost;
@@ -211,16 +119,18 @@
                        MODE_INFO *mi,
                        int *no_pred_segcounts,
                        int (*temporal_predictor_count)[2],
-                       int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS],
-                       int mb_size, int mb_row, int mb_col) {
+                       int *t_unpred_seg_counts,
+                       int bw, int bh, int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int segmap_index = mb_row * cm->mb_cols + mb_col;
-  const int segment_id = mi->mbmi.segment_id;
+  int segment_id;
 
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  segment_id = mi->mbmi.segment_id;
   xd->mode_info_context = mi;
-  set_mb_row(cm, xd, mb_row, mb_size);
-  set_mb_col(cm, xd, mb_col, mb_size);
+  set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
   // Count the number of hits on each segment with no prediction
   no_pred_segcounts[segment_id]++;
@@ -228,7 +138,8 @@
   // Temporal prediction not allowed on key frames
   if (cm->frame_type != KEY_FRAME) {
     // Test to see if the segment id matches the predicted value.
-    const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, segmap_index);
+    const int pred_seg_id = vp9_get_pred_mi_segid(cm, mi->mbmi.sb_type,
+                                                  mi_row, mi_col);
     const int seg_predicted = (segment_id == pred_seg_id);
 
     // Get the segment id prediction context
@@ -241,10 +152,65 @@
 
     if (!seg_predicted)
       // Update the "unpredicted" segment count
-      t_unpred_seg_counts[pred_seg_id][segment_id]++;
+      t_unpred_seg_counts[segment_id]++;
   }
 }
 
+static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi,
+                          int *no_pred_segcounts,
+                          int (*temporal_predictor_count)[2],
+                          int *t_unpred_seg_counts,
+                          int mi_row, int mi_col,
+                          BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int mis = cm->mode_info_stride;
+  int bwl, bhl;
+  const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  bwl = mi_width_log2(mi->mbmi.sb_type);
+  bhl = mi_height_log2(mi->mbmi.sb_type);
+
+  if (bwl == bsl && bhl == bsl) {
+    count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col);
+  } else if (bwl == bsl && bhl < bsl) {
+    count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col);
+    count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col);
+  } else if (bwl < bsl && bhl == bsl) {
+    count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col);
+    count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count,
+               t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs);
+  } else {
+    BLOCK_SIZE_TYPE subsize;
+    int n;
+
+    assert(bwl < bsl && bhl < bsl);
+    if (bsize == BLOCK_SIZE_SB64X64) {
+      subsize = BLOCK_SIZE_SB32X32;
+    } else if (bsize == BLOCK_SIZE_SB32X32) {
+      subsize = BLOCK_SIZE_MB16X16;
+    } else {
+      assert(bsize == BLOCK_SIZE_MB16X16);
+      subsize = BLOCK_SIZE_SB8X8;
+    }
+
+    for (n = 0; n < 4; n++) {
+      const int y_idx = n >> 1, x_idx = n & 0x01;
+
+      count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+                    no_pred_segcounts, temporal_predictor_count,
+                    t_unpred_seg_counts,
+                    mi_row + y_idx * bs, mi_col + x_idx * bs, subsize);
+    }
+  }
+}
+
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
@@ -253,15 +219,14 @@
   int t_pred_cost = INT_MAX;
 
   int i;
-  int tile_col, mb_row, mb_col;
+  int tile_col, mi_row, mi_col;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
-  int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS];
+  int t_unpred_seg_counts[MAX_MB_SEGMENTS];
 
-  vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
-  vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS];
+  vp9_prob no_pred_tree[MB_SEG_TREE_PROBS];
+  vp9_prob t_pred_tree[MB_SEG_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
   const int mis = cm->mode_info_stride;
@@ -269,10 +234,8 @@
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
-  vpx_memset(xd->mb_segment_tree_probs, 255,
-             sizeof(xd->mb_segment_tree_probs));
-  vpx_memset(cm->segment_pred_probs, 255,
-             sizeof(cm->segment_pred_probs));
+  vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+  vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs));
 
   vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts));
   vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts));
@@ -280,53 +243,17 @@
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
-
   for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {
     vp9_get_tile_col_offsets(cm, tile_col);
-    mi_ptr = cm->mi + cm->cur_tile_mb_col_start;
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+    mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
+    for (mi_row = 0; mi_row < cm->mi_rows;
+         mi_row += 8, mi_ptr += 8 * mis) {
       mi = mi_ptr;
-      for (mb_col = cm->cur_tile_mb_col_start;
-           mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) {
-        if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
-          count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
-                     t_unpred_seg_counts, 4, mb_row, mb_col);
-        } else {
-          for (i = 0; i < 4; i++) {
-            int x_idx = (i & 1) << 1, y_idx = i & 2;
-            MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;
-
-            if (mb_col + x_idx >= cm->mb_cols ||
-                mb_row + y_idx >= cm->mb_rows) {
-              continue;
-            }
-
-            if (sb_mi->mbmi.sb_type) {
-              assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
-              count_segs(cpi, sb_mi, no_pred_segcounts,
-                         temporal_predictor_count, t_unpred_seg_counts, 2,
-                         mb_row + y_idx, mb_col + x_idx);
-            } else {
-              int j;
-
-              for (j = 0; j < 4; j++) {
-                const int x_idx_mb = x_idx + (j & 1);
-                const int y_idx_mb = y_idx + (j >> 1);
-                MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;
-
-                if (mb_col + x_idx_mb >= cm->mb_cols ||
-                    mb_row + y_idx_mb >= cm->mb_rows) {
-                  continue;
-                }
-
-                assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-                count_segs(cpi, mb_mi, no_pred_segcounts,
-                           temporal_predictor_count, t_unpred_seg_counts,
-                           1, mb_row + y_idx_mb, mb_col + x_idx_mb);
-              }
-            }
-          }
-        }
+      for (mi_col = cm->cur_tile_mi_col_start;
+           mi_col < cm->cur_tile_mi_col_end;
+           mi_col += 8, mi += 8) {
+        count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+                      t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64);
       }
     }
   }
@@ -340,21 +267,19 @@
   if (cm->frame_type != KEY_FRAME) {
     // Work out probability tree for coding those segments not
     // predicted using the temporal method and the cost.
-    calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree,
-                            t_pred_tree_mod);
-    t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree,
-                                   t_pred_tree_mod);
+    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);
+    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
-      t_nopred_prob[i] = get_binary_prob(temporal_predictor_count[i][0],
-                                         temporal_predictor_count[i][1]);
+      const int count0 = temporal_predictor_count[i][0];
+      const int count1 = temporal_predictor_count[i][1];
 
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
+
       // Add in the predictor signaling cost
-      t_pred_cost += (temporal_predictor_count[i][0] *
-                      vp9_cost_zero(t_nopred_prob[i])) +
-                     (temporal_predictor_count[i][1] *
-                      vp9_cost_one(t_nopred_prob[i]));
+      t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) +
+                     count1 * vp9_cost_one(t_nopred_prob[i]);
     }
   }
 
@@ -361,15 +286,10 @@
   // Now choose which coding method to use.
   if (t_pred_cost < no_pred_cost) {
     cm->temporal_update = 1;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               t_pred_tree, sizeof(t_pred_tree));
-    vpx_memcpy(xd->mb_segment_mispred_tree_probs,
-               t_pred_tree_mod, sizeof(t_pred_tree_mod));
-    vpx_memcpy(&cm->segment_pred_probs,
-               t_nopred_prob, sizeof(t_nopred_prob));
+    vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree));
+    vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
   } else {
     cm->temporal_update = 0;
-    vpx_memcpy(xd->mb_segment_tree_probs,
-               no_pred_tree, sizeof(no_pred_tree));
+    vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree));
   }
 }
--- a/vp9/encoder/vp9_segmentation.h
+++ b/vp9/encoder/vp9_segmentation.h
@@ -15,8 +15,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 
-void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x);
-
 void vp9_enable_segmentation(VP9_PTR ptr);
 void vp9_disable_segmentation(VP9_PTR ptr);
 
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -26,7 +26,6 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_swapyv12buffer.h"
 #include "vpx_ports/vpx_timer.h"
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
@@ -41,22 +40,17 @@
                                             int mv_col,
                                             uint8_t *pred) {
   const int which_mv = 0;
-  int_mv subpel_mv;
-  int_mv fullpel_mv;
+  int_mv mv;
 
-  subpel_mv.as_mv.row = mv_row;
-  subpel_mv.as_mv.col = mv_col;
-  // TODO(jkoleszar): Make this rounding consistent with the rest of the code
-  fullpel_mv.as_mv.row = (mv_row >> 1) & ~7;
-  fullpel_mv.as_mv.col = (mv_col >> 1) & ~7;
+  mv.as_mv.row = mv_row;
+  mv.as_mv.col = mv_col;
 
   vp9_build_inter_predictor(y_mb_ptr, stride,
                             &pred[0], 16,
-                            &subpel_mv,
+                            &mv,
                             &xd->scale_factor[which_mv],
                             16, 16,
-                            which_mv <<
-                            (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                            which_mv,
                             &xd->subpix);
 
   stride = (stride + 1) >> 1;
@@ -63,20 +57,18 @@
 
   vp9_build_inter_predictor_q4(u_mb_ptr, stride,
                                &pred[256], 8,
-                               &fullpel_mv, &subpel_mv,
+                               &mv,
                                &xd->scale_factor_uv[which_mv],
                                8, 8,
-                               which_mv <<
-                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                               which_mv,
                                &xd->subpix);
 
   vp9_build_inter_predictor_q4(v_mb_ptr, stride,
                                &pred[320], 8,
-                               &fullpel_mv, &subpel_mv,
+                               &mv,
                                &xd->scale_factor_uv[which_mv],
                                8, 8,
-                               which_mv <<
-                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),
+                               which_mv,
                                &xd->subpix);
 }
 
@@ -126,27 +118,23 @@
 #if ALT_REF_MC_ENABLED
 
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
-                                              YV12_BUFFER_CONFIG *arf_frame,
-                                              YV12_BUFFER_CONFIG *frame_ptr,
-                                              int mb_offset,
+                                              uint8_t *arf_frame_buf,
+                                              uint8_t *frame_ptr_buf,
+                                              int stride,
                                               int error_thresh) {
   MACROBLOCK *x = &cpi->mb;
+  MACROBLOCKD* const xd = &x->e_mbd;
   int step_param;
   int sadpb = x->sadperbit16;
   int bestsme = INT_MAX;
 
-  BLOCK *b = &x->block[0];
-  BLOCKD *d = &x->e_mbd.block[0];
   int_mv best_ref_mv1;
   int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+  int_mv *ref_mv;
 
   // Save input state
-  uint8_t **base_src = b->base_src;
-  int src = b->src;
-  int src_stride = b->src_stride;
-  uint8_t **base_pre = d->base_pre;
-  int pre = d->pre;
-  int pre_stride = d->pre_stride;
+  struct buf_2d src = x->plane[0].src;
+  struct buf_2d pre = xd->plane[0].pre[0];
 
   best_ref_mv1.as_int = 0;
   best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3;
@@ -153,26 +141,22 @@
   best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3;
 
   // Setup frame pointers
-  b->base_src = &arf_frame->y_buffer;
-  b->src_stride = arf_frame->y_stride;
-  b->src = mb_offset;
+  x->plane[0].src.buf = arf_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = frame_ptr_buf;
+  xd->plane[0].pre[0].stride = stride;
 
-  d->base_pre = &frame_ptr->y_buffer;
-  d->pre_stride = frame_ptr->y_stride;
-  d->pre = mb_offset;
-
   // Further step/diamond searches as necessary
-  if (cpi->Speed < 8) {
-    step_param = cpi->sf.first_step +
-                 ((cpi->Speed > 5) ? 1 : 0);
-  } else {
+  if (cpi->speed < 8)
+    step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 1 : 0);
+  else
     step_param = cpi->sf.first_step + 2;
-  }
 
   /*cpi->sf.search_method == HEX*/
   // TODO Check that the 16x16 vf & sdf are selected here
   // Ignore mv costing by sending NULL pointer instead of cost arrays
-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0],
+  ref_mv = &x->e_mbd.mode_info_context->bmi[0].as_mv[0];
+  bestsme = vp9_hex_search(x, &best_ref_mv1_full, ref_mv,
                            step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],
                            NULL, NULL, NULL, NULL,
                            &best_ref_mv1);
@@ -184,7 +168,7 @@
     int distortion;
     unsigned int sse;
     // Ignore mv costing by sending NULL pointer instead of cost array
-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0],
+    bestsme = cpi->find_fractional_mv_step(x, ref_mv,
                                            &best_ref_mv1,
                                            x->errorperbit,
                                            &cpi->fn_ptr[BLOCK_16X16],
@@ -193,13 +177,9 @@
   }
 #endif
 
-  // Save input state
-  b->base_src = base_src;
-  b->src = src;
-  b->src_stride = src_stride;
-  d->base_pre = base_pre;
-  d->pre = pre;
-  d->pre_stride = pre_stride;
+  // Restore input state
+  x->plane[0].src = src;
+  xd->plane[0].pre[0] = pre;
 
   return bestsme;
 }
@@ -225,10 +205,12 @@
   DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
 
   // Save input state
-  uint8_t *y_buffer = mbd->pre.y_buffer;
-  uint8_t *u_buffer = mbd->pre.u_buffer;
-  uint8_t *v_buffer = mbd->pre.v_buffer;
+  uint8_t* input_buffer[MAX_MB_PLANE];
+  int i;
 
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
 #if ALT_REF_MC_ENABLED
     // Source frames are extended to 16 pixels.  This is different than
@@ -264,8 +246,8 @@
         if (cpi->frames[frame] == NULL)
           continue;
 
-        mbd->block[0].bmi.as_mv[0].as_mv.row = 0;
-        mbd->block[0].bmi.as_mv[0].as_mv.col = 0;
+        mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row = 0;
+        mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col = 0;
 
         if (frame == alt_ref_index) {
           filter_weight = 2;
@@ -278,9 +260,9 @@
           // Find best match in this frame by MC
           err = temporal_filter_find_matching_mb_c
                 (cpi,
-                 cpi->frames[alt_ref_index],
-                 cpi->frames[frame],
-                 mb_y_offset,
+                 cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
+                 cpi->frames[frame]->y_buffer + mb_y_offset,
+                 cpi->frames[frame]->y_stride,
                  THRESH_LOW);
 #endif
           // Assign higher weight to matching MB if it's error
@@ -298,8 +280,8 @@
            cpi->frames[frame]->u_buffer + mb_uv_offset,
            cpi->frames[frame]->v_buffer + mb_uv_offset,
            cpi->frames[frame]->y_stride,
-           mbd->block[0].bmi.as_mv[0].as_mv.row,
-           mbd->block[0].bmi.as_mv[0].as_mv.col,
+           mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row,
+           mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col,
            predictor);
 
           // Apply the filter (YUV)
@@ -372,16 +354,15 @@
   }
 
   // Restore input state
-  mbd->pre.y_buffer = y_buffer;
-  mbd->pre.u_buffer = u_buffer;
-  mbd->pre.v_buffer = v_buffer;
+  for (i = 0; i < MAX_MB_PLANE; i++)
+    mbd->plane[i].pre[0].buf = input_buffer[i];
 }
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
+  VP9_COMMON *const cm = &cpi->common;
+
   int frame = 0;
 
-  int num_frames_backward = 0;
-  int num_frames_forward = 0;
   int frames_to_blur_backward = 0;
   int frames_to_blur_forward = 0;
   int frames_to_blur = 0;
@@ -391,15 +372,13 @@
   int blur_type = cpi->oxcf.arnr_type;
   int max_frames = cpi->active_arnr_frames;
 
-  num_frames_backward = distance;
-  num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
-                       - (num_frames_backward + 1);
+  const int num_frames_backward = distance;
+  const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
+                               - (num_frames_backward + 1);
 
   switch (blur_type) {
     case 1:
-      /////////////////////////////////////////
       // Backward Blur
-
       frames_to_blur_backward = num_frames_backward;
 
       if (frames_to_blur_backward >= max_frames)
@@ -409,7 +388,6 @@
       break;
 
     case 2:
-      /////////////////////////////////////////
       // Forward Blur
 
       frames_to_blur_forward = num_frames_forward;
@@ -422,7 +400,6 @@
 
     case 3:
     default:
-      /////////////////////////////////////////
       // Center Blur
       frames_to_blur_forward = num_frames_forward;
       frames_to_blur_backward = num_frames_backward;
@@ -462,23 +439,91 @@
 
   // Setup scaling factors. Scaling on each of the arnr frames is not supported
   vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
-      &cpi->common.yv12_fb[cpi->common.new_fb_idx],
-      cpi->common.width,
-      cpi->common.height);
+      cm->yv12_fb[cm->new_fb_idx].y_crop_width,
+      cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+      cm->width, cm->height);
   cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];
 
   // Setup frame pointers, NULL indicates frame not included in filter
   vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));
   for (frame = 0; frame < frames_to_blur; frame++) {
-    int which_buffer =  start_frame - frame;
+    int which_buffer = start_frame - frame;
     struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
                                                      which_buffer);
     cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
   }
 
-  temporal_filter_iterate_c(
-    cpi,
-    frames_to_blur,
-    frames_to_blur_backward,
-    strength);
+  temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
+                            strength);
+}
+
+void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
+                           const int group_boost) {
+  int half_gf_int;
+  int frames_after_arf;
+  int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+  int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+  int q;
+
+  // Define the arnr filter width for this group of frames:
+  // We only filter frames that lie within a distance of half
+  // the GF interval from the ARF frame. We also have to trap
+  // cases where the filter extends beyond the end of clip.
+  // Note: this_frame->frame has been updated in the loop
+  // so it now points at the ARF frame.
+  half_gf_int = cpi->baseline_gf_interval >> 1;
+  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
+
+  switch (cpi->oxcf.arnr_type) {
+    case 1:  // Backward filter
+      frames_fwd = 0;
+      if (frames_bwd > half_gf_int)
+        frames_bwd = half_gf_int;
+      break;
+
+    case 2:  // Forward filter
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      frames_bwd = 0;
+      break;
+
+    case 3:  // Centered filter
+    default:
+      frames_fwd >>= 1;
+      if (frames_fwd > frames_after_arf)
+        frames_fwd = frames_after_arf;
+      if (frames_fwd > half_gf_int)
+        frames_fwd = half_gf_int;
+
+      frames_bwd = frames_fwd;
+
+      // For even length filter there is one more frame backward
+      // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+      if (frames_bwd < half_gf_int)
+        frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+      break;
+  }
+
+  cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+
+  // Adjust the strength based on active max q
+  q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);
+  if (q > 8) {
+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+  } else {
+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q);
+    if (cpi->active_arnr_strength < 0)
+      cpi->active_arnr_strength = 0;
+  }
+
+  // Adjust number of frames in filter and strength based on gf boost level.
+  if (cpi->active_arnr_frames > (group_boost / 150)) {
+    cpi->active_arnr_frames = (group_boost / 150);
+    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+  }
+  if (cpi->active_arnr_strength > (group_boost / 300)) {
+    cpi->active_arnr_strength = (group_boost / 300);
+  }
 }
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -12,5 +12,7 @@
 #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
 
 void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
+void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
+                           const int group_boost);
 
 #endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -25,31 +25,12 @@
    compressions, then generating vp9_context.c = initial stats. */
 
 #ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
-vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
-vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
-vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];
-
-extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];
+vp9_coeff_accum context_counters[TX_SIZE_MAX_SB][BLOCK_TYPES];
+extern vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES];
 #endif  /* ENTROPY_STATS */
 
-#if CONFIG_CODE_NONZEROCOUNT
-#ifdef NZC_STATS
-unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                           [NZC4X4_TOKENS];
-unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                           [NZC8X8_TOKENS];
-unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC16X16_TOKENS];
-unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]
-                             [NZC32X32_TOKENS];
-unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]
-                            [NZC_BITS_EXTRA][2];
-#endif
-#endif
+DECLARE_ALIGNED(16, extern const uint8_t,
+                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
 const TOKENVALUE *vp9_dct_value_tokens_ptr;
@@ -59,7 +40,7 @@
 static void fill_value_tokens() {
 
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_extra_bit_struct *const e = vp9_extra_bits;
+  vp9_extra_bit *const e = vp9_extra_bits;
 
   int i = -DCT_MAX_VALUE;
   int sign = 1;
@@ -77,25 +58,25 @@
 
         while (++j < 11  &&  e[j].base_val <= a) {}
 
-        t[i].Token = --j;
+        t[i].token = --j;
         eb |= (a - e[j].base_val) << 1;
       } else
-        t[i].Token = a;
+        t[i].token = a;
 
-      t[i].Extra = eb;
+      t[i].extra = eb;
     }
 
     // initialize the cost for extra bits for all possible coefficient value.
     {
       int cost = 0;
-      vp9_extra_bit_struct *p = vp9_extra_bits + t[i].Token;
+      vp9_extra_bit *p = vp9_extra_bits + t[i].token;
 
       if (p->base_val) {
-        const int extra = t[i].Extra;
-        const int Length = p->Len;
+        const int extra = t[i].extra;
+        const int length = p->len;
 
-        if (Length)
-          cost += treed_cost(p->tree, p->prob, extra >> 1, Length);
+        if (length)
+          cost += treed_cost(p->tree, p->prob, extra >> 1, length);
 
         cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
         dct_value_cost[i + DCT_MAX_VALUE] = cost;
@@ -111,139 +92,99 @@
 
 extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
 
-static void tokenize_b(VP9_COMP *cpi,
-                       MACROBLOCKD *xd,
-                       const int ib,
-                       TOKENEXTRA **tp,
-                       PLANE_TYPE type,
-                       TX_SIZE tx_size,
-                       int dry_run) {
+struct tokenize_b_args {
+  VP9_COMP *cpi;
+  MACROBLOCKD *xd;
+  TOKENEXTRA **tp;
+  TX_SIZE tx_size;
+  int dry_run;
+};
+
+static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                       int ss_txfrm_size, void *arg) {
+  struct tokenize_b_args* const args = arg;
+  VP9_COMP *cpi = args->cpi;
+  MACROBLOCKD *xd = args->xd;
+  TOKENEXTRA **tp = args->tp;
+  PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+  TX_SIZE tx_size = ss_txfrm_size / 2;
+  int dry_run = args->dry_run;
+
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt; /* near block/prev token context index */
-  int c = 0;
-  const int eob = xd->eobs[ib];     /* one beyond last nonzero coeff */
+  int c = 0, rc = 0;
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  int16_t *qcoeff_ptr = xd->qcoeff + 16 * ib;
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+  const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ?
+                                   BLOCK_SIZE_SB8X8 : mbmi->sb_type;
+  const int bwl = b_width_log2(sb_type);
+  const int off = block >> (2 * tx_size);
+  const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
+  const int aoff = (off & ((1 << mod) - 1)) << tx_size;
+  const int loff = (off >> mod) << tx_size;
+  ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff;
+  ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff;
   int seg_eob, default_eob, pad;
   const int segment_id = mbmi->segment_id;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
   const int *scan, *nb;
   vp9_coeff_count *counts;
-  vp9_coeff_probs *probs;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
+  vp9_coeff_probs_model *coef_probs;
+  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
+  ENTROPY_CONTEXT above_ec, left_ec;
   uint8_t token_cache[1024];
-#if CONFIG_CODE_NONZEROCOUNT
-  int zerosleft, nzc = 0;
-  if (eob == 0)
-    assert(xd->nzcs[ib] == 0);
-#endif
+  TX_TYPE tx_type = DCT_DCT;
+  const uint8_t * band_translate;
+  assert((!type && !plane) || (type && plane));
 
-  if (sb_type == BLOCK_SIZE_SB64X64) {
-    a = (ENTROPY_CONTEXT *)xd->above_context +
-                                             vp9_block2above_sb64[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-  } else if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a3 = l2 = l3 = NULL;
-  } else {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
-    a1 = l1 = a2 = l2 = a3 = l3 = NULL;
-  }
-
+  counts = cpi->coef_counts[tx_size];
+  coef_probs = cpi->common.fc.coef_probs[tx_size];
   switch (tx_size) {
     default:
     case TX_4X4: {
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_4x4(xd, ib) : DCT_DCT;
-      a_ec = *a;
-      l_ec = *l;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_4x4(xd, block) : DCT_DCT;
+      above_ec = A[0] != 0;
+      left_ec = L[0] != 0;
       seg_eob = 16;
-      scan = vp9_default_zig_zag1d_4x4;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_4x4;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_4x4;
-        }
-      }
-      counts = cpi->coef_counts_4x4;
-      probs = cpi->common.fc.coef_probs_4x4;
+      scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
-      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
+      const int sz = 1 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
+      above_ec = (A[0] + A[1]) != 0;
+      left_ec = (L[0] + L[1]) != 0;
       seg_eob = 64;
-      scan = vp9_default_zig_zag1d_8x8;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_8x8;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_8x8;
-        }
-      }
-      counts = cpi->coef_counts_8x8;
-      probs = cpi->common.fc.coef_probs_8x8;
+      scan = get_scan_8x8(tx_type);
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
-      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;
-      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      }
+      const int sz = 2 + b_width_log2(sb_type);
+      const int x = block & ((1 << sz) - 1), y = block - x;
+      tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+          get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
+      above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
       seg_eob = 256;
-      scan = vp9_default_zig_zag1d_16x16;
-      if (tx_type != DCT_DCT) {
-        if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan_16x16;
-        } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan_16x16;
-        }
-      }
-      counts = cpi->coef_counts_16x16;
-      probs = cpi->common.fc.coef_probs_16x16;
+      scan = get_scan_16x16(tx_type);
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      }
+      above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
+      left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
       seg_eob = 1024;
-      scan = vp9_default_zig_zag1d_32x32;
-      counts = cpi->coef_counts_32x32;
-      probs = cpi->common.fc.coef_probs_32x32;
+      scan = vp9_default_scan_32x32;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+  pt = combine_entropy_contexts(above_ec, left_ec);
   nb = vp9_get_coef_neighbors_handle(scan, &pad);
   default_eob = seg_eob;
 
@@ -250,220 +191,94 @@
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))
     seg_eob = 0;
 
+  c = 0;
   do {
-    const int band = get_coef_band(scan, tx_size, c);
+    const int band = get_coef_band(band_translate, c);
     int token;
     int v = 0;
-#if CONFIG_CODE_NONZEROCOUNT
-    zerosleft = seg_eob - xd->nzcs[ib] - c + nzc;
-#endif
+    rc = scan[c];
+    if (c)
+      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
     if (c < eob) {
-      const int rc = scan[c];
       v = qcoeff_ptr[rc];
       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
 
-      t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
-      token    = vp9_dct_value_tokens_ptr[v].Token;
+      t->extra = vp9_dct_value_tokens_ptr[v].extra;
+      token    = vp9_dct_value_tokens_ptr[v].token;
     } else {
-#if CONFIG_CODE_NONZEROCOUNT
-      break;
-#else
       token = DCT_EOB_TOKEN;
-#endif
     }
 
-    t->Token = token;
-    t->context_tree = probs[type][ref][band][pt];
-#if CONFIG_CODE_NONZEROCOUNT
-    // Skip zero node if there are no zeros left
-    t->skip_eob_node = 1 + (zerosleft == 0);
+    t->token = token;
+    t->context_tree = coef_probs[type][ref][band][pt];
+    t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0);
+
+#if CONFIG_BALANCED_COEFTREE
+    assert(token <= ZERO_TOKEN ||
+           vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
 #else
-    t->skip_eob_node = (c > 0) && (token_cache[c - 1] == 0);
+    assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0);
 #endif
-    assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
+
     if (!dry_run) {
       ++counts[type][ref][band][pt][token];
+#if CONFIG_BALANCED_COEFTREE
+      if (!t->skip_eob_node && token > ZERO_TOKEN)
+#else
       if (!t->skip_eob_node)
+#endif
         ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];
     }
-#if CONFIG_CODE_NONZEROCOUNT
-    nzc += (v != 0);
-#endif
-    token_cache[c] = token;
-
-    pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);
+    token_cache[scan[c]] = vp9_pt_energy_class[token];
     ++t;
   } while (c < eob && ++c < seg_eob);
-#if CONFIG_CODE_NONZEROCOUNT
-  assert(nzc == xd->nzcs[ib]);
-#endif
 
   *tp = t;
-  a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */
-  a[0] = a_ec;
-  l[0] = l_ec;
-
-  if (tx_size == TX_8X8) {
-    a[1] = a_ec;
-    l[1] = l_ec;
-  } else if (tx_size == TX_16X16) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-    } else {
-      a1[0] = a1[1] = a[1] = a_ec;
-      l1[0] = l1[1] = l[1] = l_ec;
+  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
+    set_contexts_on_border(xd, bsize, plane, tx_size, c, aoff, loff, A, L);
+  } else {
+    for (pt = 0; pt < (1 << tx_size); pt++) {
+      A[pt] = L[pt] = c > 0;
     }
-  } else if (tx_size == TX_32X32) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
-    } else {
-      a[1] = a1[0] = a1[1] = a_ec;
-      l[1] = l1[0] = l1[1] = l_ec;
-      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
-      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
-    }
   }
 }
 
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 16; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
+struct is_skippable_args {
+  MACROBLOCKD *xd;
+  int *skippable;
+};
+static void is_skippable(int plane, int block,
+                         BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *argv) {
+  struct is_skippable_args *args = argv;
+  args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
 }
 
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i;
-
-  for (i = 16; i < 24; i++)
-    skip &= (!xd->eobs[i]);
-  return skip;
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  int result = 1;
+  struct is_skippable_args args = {xd, &result};
+  foreach_transformed_block(xd, bsize, is_skippable, &args);
+  return result;
 }
 
-static int mb_is_skippable_4x4(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_4x4(xd) &
-          vp9_mbuv_is_skippable_4x4(xd));
+int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  int result = 1;
+  struct is_skippable_args args = {xd, &result};
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     is_skippable, &args);
+  return result;
 }
 
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 16; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
+int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+  int result = 1;
+  struct is_skippable_args args = {xd, &result};
+  foreach_transformed_block_uv(xd, bsize, is_skippable, &args);
+  return result;
 }
 
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (!xd->eobs[16]) & (!xd->eobs[20]);
-}
-
-static int mb_is_skippable_8x8(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_8x8(xd) &
-          vp9_mbuv_is_skippable_8x8(xd));
-}
-
-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_8x8(xd) &
-          vp9_mbuv_is_skippable_4x4(xd));
-}
-
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (!xd->eobs[0]);
-}
-
-static int mb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
-}
-
-int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
-  return (!xd->eobs[0]);
-}
-
-int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
-  return (!xd->eobs[64]) & (!xd->eobs[80]);
-}
-
-static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_32x32(xd) &&
-         vp9_sbuv_is_skippable_16x16(xd);
-}
-
-int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 64; i += 16)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb_is_skippable_16x16(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd);
-}
-
-int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 64; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 64; i < 96; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb_is_skippable_8x8(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_8x8(xd) & vp9_sbuv_is_skippable_8x8(xd);
-}
-
-int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 64; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 64; i < 96; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb_is_skippable_4x4(MACROBLOCKD *xd) {
-  return vp9_sby_is_skippable_4x4(xd) & vp9_sbuv_is_skippable_4x4(xd);
-}
-
 void vp9_tokenize_sb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
-                     int dry_run) {
+                     int dry_run, BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON * const cm = &cpi->common;
   MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
   TOKENEXTRA *t_backup = *t;
@@ -470,32 +285,17 @@
   const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
   const int segment_id = mbmi->segment_id;
   const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  int b;
+  const TX_SIZE txfm_size = mbmi->txfm_size;
+  struct tokenize_b_args arg = {
+    cpi, xd, t, txfm_size, dry_run
+  };
 
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
-      break;
-    case TX_16X16:
-      mbmi->mb_skip_coeff = sb_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      mbmi->mb_skip_coeff = sb_is_skippable_8x8(xd);
-      break;
-    case TX_4X4:
-      mbmi->mb_skip_coeff = sb_is_skippable_4x4(xd);
-      break;
-    default: assert(0);
-  }
+  mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize);
 
   if (mbmi->mb_skip_coeff) {
     if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cm->mb_no_coeff_skip) {
-      vp9_stuff_sb(cpi, xd, t, dry_run);
-    } else {
-      vp9_reset_sb_tokens_context(xd);
-    }
+      cm->fc.mbskip_count[mb_skip_context][1] += skip_inc;
+    vp9_reset_sb_tokens_context(xd, bsize);
     if (dry_run)
       *t = t_backup;
     return;
@@ -502,335 +302,29 @@
   }
 
   if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
+    cm->fc.mbskip_count[mb_skip_context][0] += skip_inc;
 
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,
-                 TX_32X32, dry_run);
-      for (b = 64; b < 96; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_16X16, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 64; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_16X16, dry_run);
-      for (b = 64; b < 96; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 64; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_8X8, dry_run);
-      for (b = 64; b < 96; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 64; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_4X4, dry_run);
-      for (b = 64; b < 96; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
+  foreach_transformed_block(xd, bsize, tokenize_b, &arg);
 
   if (dry_run)
     *t = t_backup;
 }
 
-int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i += 64)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd) {
-  return (!xd->eobs[256]) & (!xd->eobs[320]);
-}
-
-static int sb64_is_skippable_32x32(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd);
-}
-
-int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i += 16)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 256; i < 384; i += 16)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb64_is_skippable_16x16(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_16x16(xd) & vp9_sb64uv_is_skippable_16x16(xd);
-}
-
-int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 256; i < 384; i += 4)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb64_is_skippable_8x8(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_8x8(xd) & vp9_sb64uv_is_skippable_8x8(xd);
-}
-
-int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 0; i < 256; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd) {
-  int skip = 1;
-  int i = 0;
-
-  for (i = 256; i < 384; i++)
-    skip &= (!xd->eobs[i]);
-
-  return skip;
-}
-
-static int sb64_is_skippable_4x4(MACROBLOCKD *xd) {
-  return vp9_sb64y_is_skippable_4x4(xd) & vp9_sb64uv_is_skippable_4x4(xd);
-}
-
-void vp9_tokenize_sb64(VP9_COMP *cpi,
-                       MACROBLOCKD *xd,
-                       TOKENEXTRA **t,
-                       int dry_run) {
-  VP9_COMMON * const cm = &cpi->common;
-  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
-  TOKENEXTRA *t_backup = *t;
-  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
-  const int segment_id = mbmi->segment_id;
-  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-  int b;
-
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      mbmi->mb_skip_coeff = sb64_is_skippable_32x32(xd);
-      break;
-    case TX_16X16:
-      mbmi->mb_skip_coeff = sb64_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      mbmi->mb_skip_coeff = sb64_is_skippable_8x8(xd);
-      break;
-    case TX_4X4:
-      mbmi->mb_skip_coeff = sb64_is_skippable_4x4(xd);
-      break;
-    default: assert(0);
-  }
-
-  if (mbmi->mb_skip_coeff) {
-    if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cm->mb_no_coeff_skip) {
-      vp9_stuff_sb64(cpi, xd, t, dry_run);
-    } else {
-      vp9_reset_sb64_tokens_context(xd);
-    }
-    if (dry_run)
-      *t = t_backup;
-    return;
-  }
-
-  if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
-
-  switch (mbmi->txfm_size) {
-    case TX_32X32:
-      for (b = 0; b < 256; b += 64)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_32X32, dry_run);
-      for (b = 256; b < 384; b += 64)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_32X32, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 256; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_16X16, dry_run);
-      for (b = 256; b < 384; b += 16)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 256; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_8X8, dry_run);
-      for (b = 256; b < 384; b += 4)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 256; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,
-                   TX_4X4, dry_run);
-      for (b = 256; b < 384; b++)
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
-                   TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run)
-    *t = t_backup;
-}
-
-void vp9_tokenize_mb(VP9_COMP *cpi,
-                     MACROBLOCKD *xd,
-                     TOKENEXTRA **t,
-                     int dry_run) {
-  int b;
-  int tx_size = xd->mode_info_context->mbmi.txfm_size;
-  int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
-  TOKENEXTRA *t_backup = *t;
-
-  // If the MB is going to be skipped because of a segment level flag
-  // exclude this from the skip count stats used to calculate the
-  // transmitted skip probability;
-  int skip_inc;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-    skip_inc = 1;
-  } else
-    skip_inc = 0;
-
-  switch (tx_size) {
-    case TX_16X16:
-
-      xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
-      break;
-    case TX_8X8:
-      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-          xd->mode_info_context->mbmi.mode == SPLITMV)
-        xd->mode_info_context->mbmi.mb_skip_coeff =
-            mb_is_skippable_8x8_4x4uv(xd);
-      else
-        xd->mode_info_context->mbmi.mb_skip_coeff =
-            mb_is_skippable_8x8(xd);
-      break;
-
-    default:
-      xd->mode_info_context->mbmi.mb_skip_coeff =
-          mb_is_skippable_4x4(xd);
-      break;
-  }
-
-  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
-    if (!dry_run)
-      cpi->skip_true_count[mb_skip_context] += skip_inc;
-    if (!cpi->common.mb_no_coeff_skip) {
-      vp9_stuff_mb(cpi, xd, t, dry_run);
-    } else {
-      vp9_reset_mb_tokens_context(xd);
-    }
-
-    if (dry_run)
-      *t = t_backup;
-    return;
-  }
-
-  if (!dry_run)
-    cpi->skip_false_count[mb_skip_context] += skip_inc;
-
-  if (tx_size == TX_16X16) {
-    tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-    for (b = 16; b < 24; b += 4) {
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-    }
-  } else if (tx_size == TX_8X8) {
-    for (b = 0; b < 16; b += 4) {
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-    }
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      for (b = 16; b < 24; b++) {
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      }
-    } else {
-      for (b = 16; b < 24; b += 4) {
-        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      }
-    }
-  } else {
-    for (b = 0; b < 16; b++)
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-    for (b = 16; b < 24; b++)
-      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-  }
-  if (dry_run)
-    *t = t_backup;
-}
-
 #ifdef ENTROPY_STATS
 void init_context_counters(void) {
   FILE *f = fopen("context.bin", "rb");
   if (!f) {
-    vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4));
-    vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
-    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
-    vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32));
+    vp9_zero(context_counters);
   } else {
-    fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
-    fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-    fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
+    fread(context_counters, sizeof(context_counters), 1, f);
     fclose(f);
   }
 
   f = fopen("treeupdate.bin", "rb");
   if (!f) {
-    vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4));
-    vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
-    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
-    vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32));
+    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
   } else {
-    fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
-    fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
-    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
-    fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
+    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
     fclose(f);
   }
 }
@@ -932,32 +426,29 @@
   fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
 
   /* print counts */
-  print_counter(f, context_counters_4x4, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
                 "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
-  print_counter(f, context_counters_8x8, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
                 "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
-  print_counter(f, context_counters_16x16, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
                 "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
-  print_counter(f, context_counters_32x32, BLOCK_TYPES,
+  print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
                 "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
 
   /* print coefficient probabilities */
-  print_probs(f, context_counters_4x4, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
               "default_coef_probs_4x4[BLOCK_TYPES]");
-  print_probs(f, context_counters_8x8, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
               "default_coef_probs_8x8[BLOCK_TYPES]");
-  print_probs(f, context_counters_16x16, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
               "default_coef_probs_16x16[BLOCK_TYPES]");
-  print_probs(f, context_counters_32x32, BLOCK_TYPES,
+  print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
               "default_coef_probs_32x32[BLOCK_TYPES]");
 
   fclose(f);
 
   f = fopen("context.bin", "wb");
-  fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
-  fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
-  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
-  fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
+  fwrite(context_counters, sizeof(context_counters), 1, f);
   fclose(f);
 }
 #endif
@@ -964,269 +455,4 @@
 
 void vp9_tokenize_initialize() {
   fill_value_tokens();
-}
-
-static void stuff_b(VP9_COMP *cpi,
-                    MACROBLOCKD *xd,
-                    const int ib,
-                    TOKENEXTRA **tp,
-                    PLANE_TYPE type,
-                    TX_SIZE tx_size,
-                    int dry_run) {
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
-#if CONFIG_CODE_NONZEROCOUNT == 0
-  vp9_coeff_count *counts;
-  vp9_coeff_probs *probs;
-  int pt, band;
-  TOKENEXTRA *t = *tp;
-  const int ref = mbmi->ref_frame != INTRA_FRAME;
-#endif
-  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;
-
-  if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context +
-                                             vp9_block2above_sb64[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-  } else if (sb_type == BLOCK_SIZE_SB32X32) {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];
-    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);
-    a2 = l2 = a3 = l3 = NULL;
-  } else {
-    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];
-    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];
-    a1 = l1 = a2 = l2 = a3 = l3 = NULL;
-  }
-
-  switch (tx_size) {
-    default:
-    case TX_4X4:
-      a_ec = a[0];
-      l_ec = l[0];
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_4x4;
-      probs = cpi->common.fc.coef_probs_4x4;
-#endif
-      break;
-    case TX_8X8:
-      a_ec = (a[0] + a[1]) != 0;
-      l_ec = (l[0] + l[1]) != 0;
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_8x8;
-      probs = cpi->common.fc.coef_probs_8x8;
-#endif
-      break;
-    case TX_16X16:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
-      }
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_16x16;
-      probs = cpi->common.fc.coef_probs_16x16;
-#endif
-      break;
-    case TX_32X32:
-      if (type != PLANE_TYPE_UV) {
-        a_ec = (a[0] + a[1] + a[2] + a[3] +
-                a1[0] + a1[1] + a1[2] + a1[3]) != 0;
-        l_ec = (l[0] + l[1] + l[2] + l[3] +
-                l1[0] + l1[1] + l1[2] + l1[3]) != 0;
-      } else {
-        a_ec = (a[0] + a[1] + a1[0] + a1[1] +
-                a2[0] + a2[1] + a3[0] + a3[1]) != 0;
-        l_ec = (l[0] + l[1] + l1[0] + l1[1] +
-                l2[0] + l2[1] + l3[0] + l3[1]) != 0;
-      }
-#if CONFIG_CODE_NONZEROCOUNT == 0
-      counts = cpi->coef_counts_32x32;
-      probs = cpi->common.fc.coef_probs_32x32;
-#endif
-      break;
-  }
-
-#if CONFIG_CODE_NONZEROCOUNT == 0
-  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
-  band = 0;
-  t->Token = DCT_EOB_TOKEN;
-  t->context_tree = probs[type][ref][band][pt];
-  t->skip_eob_node = 0;
-  ++t;
-  *tp = t;
-  if (!dry_run) {
-    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];
-  }
-#endif
-  *a = *l = 0;
-  if (tx_size == TX_8X8) {
-    a[1] = 0;
-    l[1] = 0;
-  } else if (tx_size == TX_16X16) {
-    if (type != PLANE_TYPE_UV) {
-      a[1] = a[2] = a[3] = 0;
-      l[1] = l[2] = l[3] = 0;
-    } else {
-      a1[0] = a1[1] = a[1] = a_ec;
-      l1[0] = l1[1] = l[1] = l_ec;
-    }
-  } else if (tx_size == TX_32X32) {
-    if (type != PLANE_TYPE_Y_WITH_DC) {
-      a[1] = a[2] = a[3] = a_ec;
-      l[1] = l[2] = l[3] = l_ec;
-      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
-      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
-    } else {
-      a[1] = a1[0] = a1[1] = a_ec;
-      l[1] = l1[0] = l1[1] = l_ec;
-      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;
-      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;
-    }
-  }
-}
-
-static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-  for (b = 16; b < 24; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-}
-
-static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
-                           TOKENEXTRA **t, int dry_run) {
-  int b;
-  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-  }
-}
-
-static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
-                         TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-}
-
-static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
-                               TOKENEXTRA **t, int dry_run) {
-  int b;
-
-  for (b = 0; b < 16; b += 4)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-}
-
-void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
-  TOKENEXTRA * const t_backup = *t;
-
-  if (tx_size == TX_16X16) {
-    stuff_mb_16x16(cpi, xd, t, dry_run);
-  } else if (tx_size == TX_8X8) {
-    if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-        xd->mode_info_context->mbmi.mode == SPLITMV) {
-      stuff_mb_8x8_4x4uv(cpi, xd, t, dry_run);
-    } else {
-      stuff_mb_8x8(cpi, xd, t, dry_run);
-    }
-  } else {
-    stuff_mb_4x4(cpi, xd, t, dry_run);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
-void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
-  TOKENEXTRA * const t_backup = *t;
-  int b;
-
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
-      for (b = 64; b < 96; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 64; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-      for (b = 64; b < 96; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 64; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-      for (b = 64; b < 96; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 64; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-      for (b = 64; b < 96; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
-}
-
-void vp9_stuff_sb64(VP9_COMP *cpi, MACROBLOCKD *xd,
-                    TOKENEXTRA **t, int dry_run) {
-  TOKENEXTRA * const t_backup = *t;
-  int b;
-
-  switch (xd->mode_info_context->mbmi.txfm_size) {
-    case TX_32X32:
-      for (b = 0; b < 256; b += 64)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
-      for (b = 256; b < 384; b += 64)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_32X32, dry_run);
-      break;
-    case TX_16X16:
-      for (b = 0; b < 256; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
-      for (b = 256; b < 384; b += 16)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
-      break;
-    case TX_8X8:
-      for (b = 0; b < 256; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);
-      for (b = 256; b < 384; b += 4)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
-      break;
-    case TX_4X4:
-      for (b = 0; b < 256; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);
-      for (b = 256; b < 384; b++)
-        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
-      break;
-    default: assert(0);
-  }
-
-  if (dry_run) {
-    *t = t_backup;
-  }
 }
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -17,14 +17,14 @@
 void vp9_tokenize_initialize();
 
 typedef struct {
-  int16_t Token;
-  int16_t Extra;
+  int16_t token;
+  int16_t extra;
 } TOKENVALUE;
 
 typedef struct {
   const vp9_prob *context_tree;
-  int16_t         Extra;
-  uint8_t         Token;
+  int16_t         extra;
+  uint8_t         token;
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
@@ -31,51 +31,19 @@
 typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
                                [MAX_ENTROPY_TOKENS + 1];
 
-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd);
-int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd);
-
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
+int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
+int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
 struct VP9_COMP;
 
-void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                     TOKENEXTRA **t, int dry_run);
 void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                     TOKENEXTRA **t, int dry_run);
-void vp9_tokenize_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                       TOKENEXTRA **t, int dry_run);
+                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize);
 
-void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                  TOKENEXTRA **t, int dry_run);
-void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                  TOKENEXTRA **t, int dry_run);
-void vp9_stuff_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,
-                    TOKENEXTRA **t, int dry_run);
-
 #ifdef ENTROPY_STATS
 void init_context_counters();
 void print_context_counters();
 
-extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];
-extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];
+extern vp9_coeff_accum context_counters[TX_SIZE_MAX_SB][BLOCK_TYPES];
 #endif
 
 extern const int *vp9_dct_value_cost_ptr;
--- a/vp9/encoder/vp9_treewriter.c
+++ b/vp9/encoder/vp9_treewriter.c
@@ -8,35 +8,31 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_common.h"
 
-static void cost(
-  int *const C,
-  vp9_tree T,
-  const vp9_prob *const P,
-  int i,
-  int c
-) {
-  const vp9_prob p = P [i >> 1];
+static void cost(int *costs, vp9_tree tree, const vp9_prob *probs,
+                 int i, int c) {
+  const vp9_prob prob = probs[i / 2];
+  int b;
 
-  do {
-    const vp9_tree_index j = T[i];
-    const int d = c + vp9_cost_bit(p, i & 1);
+  for (b = 0; b <= 1; ++b) {
+    const int cc = c + vp9_cost_bit(prob, b);
+    const vp9_tree_index ii = tree[i + b];
 
-    if (j <= 0)
-      C[-j] = d;
+    if (ii <= 0)
+      costs[-ii] = cc;
     else
-      cost(C, T, P, j, d);
-  } while (++i & 1);
+      cost(costs, tree, probs, ii, cc);
+  }
 }
-void vp9_cost_tokens(int *c, const vp9_prob *p, vp9_tree t) {
-  cost(c, t, p, 0, 0);
+
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  cost(costs, tree, probs, 0, 0);
 }
 
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {
-  assert(t[1] > 0 && t[0] <= 0);
-  c[-t[0]] = vp9_cost_bit(p[0], 0);
-  cost(c, t, p, 2, 0);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
+  assert(tree[0] <= 0 && tree[1] > 0);
+
+  costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
+  cost(costs, tree, probs, 2, 0);
 }
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -19,11 +19,8 @@
 
 #include "vp9/encoder/vp9_boolhuff.h"       /* for now */
 
-typedef BOOL_CODER vp9_writer;
 
-#define vp9_write encode_bool
-#define vp9_write_literal vp9_encode_value
-#define vp9_write_bit(W, V) vp9_write(W, V, vp9_prob_half)
+#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
 
 /* Approximate length of an encoded bool in 256ths of a bit at given prob */
 
@@ -38,69 +35,53 @@
 /* Both of these return bits, not scaled bits. */
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
                                           vp9_prob p) {
-  /* Imitate existing calculation */
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
 static INLINE unsigned int cost_branch(const unsigned int ct[2],
                                        vp9_prob p) {
-  /* Imitate existing calculation */
   return cost_branch256(ct, p) >> 8;
 }
 
 
-/* Small functions to write explicit values and tokens, as well as
-   estimate their lengths. */
-
-static INLINE void treed_write(vp9_writer *const w,
-                               vp9_tree t,
-                               const vp9_prob *const p,
-                               int v,
-                               /* number of bits in v, assumed nonzero */
-                               int n) {
+static INLINE void treed_write(vp9_writer *w,
+                               vp9_tree tree, const vp9_prob *probs,
+                               int bits, int len) {
   vp9_tree_index i = 0;
 
   do {
-    const int b = (v >> --n) & 1;
-    vp9_write(w, b, p[i >> 1]);
-    i = t[i + b];
-  } while (n);
+    const int bit = (bits >> --len) & 1;
+    vp9_write(w, bit, probs[i >> 1]);
+    i = tree[i + bit];
+  } while (len);
 }
 
-static INLINE void write_token(vp9_writer *const w,
-                               vp9_tree t,
-                               const vp9_prob *const p,
-                               vp9_token *const x) {
-  treed_write(w, t, p, x->value, x->Len);
+static INLINE void write_token(vp9_writer *w, vp9_tree tree,
+                               const vp9_prob *probs,
+                               const struct vp9_token *token) {
+  treed_write(w, tree, probs, token->value, token->len);
 }
 
-static INLINE int treed_cost(vp9_tree t,
-                             const vp9_prob *const p,
-                             int v,
-                             /* number of bits in v, assumed nonzero */
-                             int n) {
-  int c = 0;
+static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+                             int bits, int len) {
+  int cost = 0;
   vp9_tree_index i = 0;
 
   do {
-    const int b = (v >> --n) & 1;
-    c += vp9_cost_bit(p[i >> 1], b);
-    i = t[i + b];
-  } while (n);
+    const int bit = (bits >> --len) & 1;
+    cost += vp9_cost_bit(probs[i >> 1], bit);
+    i = tree[i + bit];
+  } while (len);
 
-  return c;
+  return cost;
 }
 
-static INLINE int cost_token(vp9_tree t,
-                             const vp9_prob *const p,
-                             vp9_token *const x) {
-  return treed_cost(t, p, x->value, x->Len);
+static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
+                             const struct vp9_token *token) {
+  return treed_cost(tree, probs, token->value, token->len);
 }
 
-/* Fill array of costs for all possible token values. */
-
-void vp9_cost_tokens(int *Costs, const vp9_prob *, vp9_tree);
-
-void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
 
 #endif  // VP9_ENCODER_VP9_TREEWRITER_H_
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -50,6 +51,15 @@
                                                 int Refstride,
                                                 unsigned int *sse);
 
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+                                                   int source_stride,
+                                                   int xoffset,
+                                                   int yoffset,
+                                                   const uint8_t *ref_ptr,
+                                                   int Refstride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+
 typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,31 @@
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
+    vp9_sad_fn_t               sdf;
+    vp9_variance_fn_t          vf;
+    vp9_subpixvariance_fn_t    svf;
+    vp9_subp_avg_variance_fn_t svaf;
+    vp9_variance_fn_t          svf_halfpix_h;
+    vp9_variance_fn_t          svf_halfpix_v;
+    vp9_variance_fn_t          svf_halfpix_hv;
+    vp9_sad_multi_fn_t         sdx3f;
+    vp9_sad_multi1_fn_t        sdx8f;
+    vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+                          int height, uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < weight; j++) {
+      int tmp;
+      tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += weight;
+    pred += weight;
+    ref += ref_stride;
+  }
+}
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
@@ -24,6 +25,234 @@
   return sum;
 }
 
+unsigned int vp9_variance64x32_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
+}
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+
+  return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
+}
+
+unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+
+  return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+
+  return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+
+  return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -89,6 +318,11 @@
   return (var - (((unsigned int)avg * avg) >> 7));
 }
 
+void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
+                       const uint8_t *ref_ptr, int ref_stride,
+                       unsigned int *sse, int *sum) {
+  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
+}
 
 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
                                int  source_stride,
@@ -103,6 +337,32 @@
   return (var - (((unsigned int)avg * avg) >> 6));
 }
 
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
+                               int  source_stride,
+                               const uint8_t *ref_ptr,
+                               int  recon_stride,
+                               unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
                                int  source_stride,
                                const uint8_t *ref_ptr,
@@ -130,7 +390,46 @@
   return var;
 }
 
+unsigned int vp9_mse16x8_c(const uint8_t *src_ptr,
+                           int  source_stride,
+                           const uint8_t *ref_ptr,
+                           int  recon_stride,
+                           unsigned int *sse) {
+  unsigned int var;
+  int avg;
 
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr,
+                           int  source_stride,
+                           const uint8_t *ref_ptr,
+                           int  recon_stride,
+                           unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+unsigned int vp9_mse8x8_c(const uint8_t *src_ptr,
+                          int  source_stride,
+                          const uint8_t *ref_ptr,
+                          int  recon_stride,
+                          unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+  *sse = var;
+  return var;
+}
+
+
 unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
                                          int  xoffset,
@@ -139,22 +438,48 @@
                                          int dst_pixels_per_line,
                                          unsigned int *sse) {
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
-  uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering
+  const int16_t *hfilter, *vfilter;
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
   // First filter 1d Horizontal
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
 
   // Now filter Verticaly
-  var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
 
   return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
 
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  // First filter 1d Horizontal
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
+
+  // Now filter Vertically
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
                                          int  xoffset,
@@ -162,19 +487,43 @@
                                          const uint8_t *dst_ptr,
                                          int dst_pixels_per_line,
                                          unsigned int *sse) {
-  uint16_t FData3[9 * 8];  // Temp data bufffer used in filtering
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
 
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -182,19 +531,44 @@
                                            const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  uint16_t FData3[17 * 16];  // Temp data bufffer used in filtering
+  uint16_t fdata3[17 * 16];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
 
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[17 * 16];
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -202,20 +576,43 @@
                                            const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  uint16_t FData3[65 * 64];  // Temp data bufffer used in filtering
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
   uint8_t temp2[68 * 64];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
-                                    1, 65, 64, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 64, 64, 64, 64, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
 
   return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -223,19 +620,43 @@
                                            const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  uint16_t FData3[33 * 32];  // Temp data bufffer used in filtering
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
   uint8_t temp2[36 * 32];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
 
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
                                               const uint8_t *ref_ptr,
@@ -363,19 +784,43 @@
                                           const uint8_t *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
-  uint16_t FData3[16 * 9];  // Temp data bufffer used in filtering
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
 
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
@@ -383,17 +828,129 @@
                                           const uint8_t *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
-  uint16_t FData3[9 * 16];  // Temp data bufffer used in filtering
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
   uint8_t temp2[20 * 16];
-  const int16_t *HFilter, *VFilter;
+  const int16_t *hfilter, *vfilter;
 
-  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
-  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
 
-  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
-                                    1, 17, 8, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
 
   return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+
+  return vp9_variance8x4_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
+  return vp9_variance8x4_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
+                                         int  src_pixels_per_line,
+                                         int  xoffset,
+                                         int  yoffset,
+                                         const uint8_t *dst_ptr,
+                                         int dst_pixels_per_line,
+                                         unsigned int *sse) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be
+  // this big; the same issue appears in all other block size settings.
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+
+  return vp9_variance4x8_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 4, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
+  return vp9_variance4x8_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
--- /dev/null
+++ b/vp9/encoder/vp9_write_bit_buffer.h
@@ -1,0 +1,48 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_BIT_WRITE_BUFFER_H_
+#define VP9_BIT_WRITE_BUFFER_H_
+
+#include <limits.h>
+
+#include "vpx/vpx_integer.h"
+
+struct vp9_write_bit_buffer {
+  uint8_t *bit_buffer;
+  size_t bit_offset;
+};
+
+static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) {
+  const int off = wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (q == CHAR_BIT -1) {
+    wb->bit_buffer[p] = bit << q;
+  } else {
+    wb->bit_buffer[p] &= ~(1 << q);
+    wb->bit_buffer[p] |= bit << q;
+  }
+  wb->bit_offset = off + 1;
+}
+
+static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb,
+                              int data, int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    vp9_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+
+#endif  // VP9_BIT_WRITE_BUFFER_H_
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ b/vp9/encoder/x86/vp9_encodeopt.asm
@@ -123,254 +123,3 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
-
-
-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
-global sym(vp9_mbblock_error_mmx_impl) PRIVATE
-sym(vp9_mbblock_error_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        mm7,        mm7
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        mm2,        mm2
-
-        mov         rcx,        16
-
-.mberror_loop_mmx:
-        movq        mm3,       [rsi]
-        movq        mm4,       [rdi]
-
-        movq        mm5,       [rsi+8]
-        movq        mm6,       [rdi+8]
-
-
-        psubw       mm5,        mm6
-        pmaddwd     mm5,        mm5
-
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        movq        mm3,       [rsi+16]
-
-        movq        mm4,       [rdi+16]
-        movq        mm5,       [rsi+24]
-
-        movq        mm6,       [rdi+24]
-        psubw       mm5,        mm6
-
-        pmaddwd     mm5,        mm5
-        psubw       mm3,        mm4
-
-        pmaddwd     mm3,        mm3
-        paddd       mm2,        mm5
-
-        paddd       mm2,        mm3
-        add         rsi,        32
-
-        add         rdi,        32
-        sub         rcx,        1
-
-        jnz         .mberror_loop_mmx
-
-        movq        mm0,        mm2
-        psrlq       mm2,        32
-
-        paddd       mm0,        mm2
-        movq        rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
-global sym(vp9_mbblock_error_xmm_impl) PRIVATE
-sym(vp9_mbblock_error_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov         rsi,        arg(0) ;coeff_ptr
-        pxor        xmm5,       xmm5
-
-        mov         rdi,        arg(1) ;dcoef_ptr
-        pxor        xmm4,       xmm4
-
-        mov         rcx,        16
-
-.mberror_loop:
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm1,       [rdi]
-
-        movdqa      xmm2,       [rsi+16]
-        movdqa      xmm3,       [rdi+16]
-
-
-        psubw       xmm2,       xmm3
-        pmaddwd     xmm2,       xmm2
-
-        psubw       xmm0,       xmm1
-
-        pmaddwd     xmm0,       xmm0
-        add         rsi,        32
-
-        add         rdi,        32
-
-        sub         rcx,        1
-        paddd       xmm4,       xmm2
-
-        paddd       xmm4,       xmm0
-        jnz         .mberror_loop
-
-        movdqa      xmm0,       xmm4
-        punpckldq   xmm0,       xmm5
-
-        punpckhdq   xmm4,       xmm5
-        paddd       xmm0,       xmm4
-
-        movdqa      xmm1,       xmm0
-        psrldq      xmm0,       8
-
-        paddd       xmm0,       xmm1
-        movq        rax,        xmm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_mmx_impl) PRIVATE
-sym(vp9_mbuverror_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            mm7,        mm7
-
-.mbuverror_loop_mmx:
-
-        movq            mm1,        [rsi]
-        movq            mm2,        [rdi]
-
-        psubw           mm1,        mm2
-        pmaddwd         mm1,        mm1
-
-
-        movq            mm3,        [rsi+8]
-        movq            mm4,        [rdi+8]
-
-        psubw           mm3,        mm4
-        pmaddwd         mm3,        mm3
-
-
-        paddd           mm7,        mm1
-        paddd           mm7,        mm3
-
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop_mmx
-
-        movq            mm0,        mm7
-        psrlq           mm7,        32
-
-        paddd           mm0,        mm7
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-global sym(vp9_mbuverror_xmm_impl) PRIVATE
-sym(vp9_mbuverror_xmm_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;s_ptr
-        mov             rdi,        arg(1) ;d_ptr
-
-        mov             rcx,        16
-        pxor            xmm3,       xmm3
-
-.mbuverror_loop:
-
-        movdqa          xmm1,       [rsi]
-        movdqa          xmm2,       [rdi]
-
-        psubw           xmm1,       xmm2
-        pmaddwd         xmm1,       xmm1
-
-        paddd           xmm3,       xmm1
-
-        add             rsi,        16
-        add             rdi,        16
-
-        dec             rcx
-        jnz             .mbuverror_loop
-
-        pxor        xmm0,           xmm0
-        movdqa      xmm1,           xmm3
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        paddd       xmm1,           xmm2
-
-        movdqa      xmm2,           xmm1
-
-        psrldq      xmm1,           8
-        paddd       xmm1,           xmm2
-
-        movq            rax,            xmm1
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp9/encoder/x86/vp9_quantize_mmx.asm
+++ /dev/null
@@ -1,286 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *scan_mask, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp9_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp9_fast_quantize_b_impl_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov             rsi,        arg(0) ;coeff_ptr
-        movq            mm0,        [rsi]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm1,        [rax]
-
-        movq            mm3,        mm0
-        psraw           mm0,        15
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0         ; abs
-
-        movq            mm2,        mm3
-        pcmpgtw         mm1,        mm2
-
-        pandn           mm1,        mm2
-        movq            mm3,        mm1
-
-        mov             rdx,        arg(6) ;quant_ptr
-        movq            mm1,        [rdx]
-
-        mov             rcx,        arg(5) ;round_ptr
-        movq            mm2,        [rcx]
-
-        paddw           mm3,        mm2
-        pmulhuw         mm3,        mm1
-
-        pxor            mm3,        mm0
-        psubw           mm3,        mm0     ;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-        movq            mm0,        mm3
-
-        movq            [rdi],      mm3
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm2,        [rax]
-
-        pmullw          mm3,        mm2
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax],      mm3
-
-        ; next 8
-        movq            mm4,        [rsi+8]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+8]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+8]
-        movq            mm6,        [rcx+8]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+8],    mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+8]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+8],    mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+16]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+16]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+16]
-        movq            mm6,        [rcx+16]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+16],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+16]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+16],   mm7
-
-
-                ; next 8
-        movq            mm4,        [rsi+24]
-
-        mov             rax,        arg(1) ;zbin_ptr
-        movq            mm5,        [rax+24]
-
-        movq            mm7,        mm4
-        psraw           mm4,        15
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4         ; abs
-
-        movq            mm6,        mm7
-        pcmpgtw         mm5,        mm6
-
-        pandn           mm5,        mm6
-        movq            mm7,        mm5
-
-        movq            mm5,        [rdx+24]
-        movq            mm6,        [rcx+24]
-
-        paddw           mm7,        mm6
-        pmulhuw         mm7,        mm5
-
-        pxor            mm7,        mm4
-        psubw           mm7,        mm4;gain the sign back
-
-        mov             rdi,        arg(2) ;qcoeff_ptr
-
-        movq            mm1,        mm7
-        movq            [rdi+24],   mm7
-
-        mov             rax,        arg(3) ;dequant_ptr
-        movq            mm6,        [rax+24]
-
-        pmullw          mm7,        mm6
-        mov             rax,        arg(7) ;dqcoeff_ptr
-
-        movq            [rax+24],   mm7
-
-
-
-        mov             rdi,        arg(4) ;scan_mask
-        mov             rsi,        arg(2) ;qcoeff_ptr
-
-        pxor            mm5,        mm5
-        pxor            mm7,        mm7
-
-        movq            mm0,        [rsi]
-        movq            mm1,        [rsi+8]
-
-        movq            mm2,        [rdi]
-        movq            mm3,        [rdi+8];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        movq            mm5,        mm0
-
-        paddd           mm5,        mm1
-
-        movq            mm0,        [rsi+16]
-        movq            mm1,        [rsi+24]
-
-        movq            mm2,        [rdi+16]
-        movq            mm3,        [rdi+24];
-
-        pcmpeqw         mm0,        mm7
-        pcmpeqw         mm1,        mm7
-
-        pcmpeqw         mm6,        mm6
-        pxor            mm0,        mm6
-
-        pxor            mm1,        mm6
-        psrlw           mm0,        15
-
-        psrlw           mm1,        15
-        pmaddwd         mm0,        mm2
-
-        pmaddwd         mm1,        mm3
-        paddd           mm5,        mm0
-
-        paddd           mm5,        mm1
-        movq            mm0,        mm5
-
-        psrlq           mm5,        32
-        paddd           mm0,        mm5
-
-        ; eob adjustment begins here
-        movq            rcx,        mm0
-        and             rcx,        0xffff
-
-        xor             rdx,        rdx
-        sub             rdx,        rcx ; rdx=-rcx
-
-        bsr             rax,        rcx
-        inc             rax
-
-        sar             rdx,        31
-        and             rax,        rdx
-        ; Substitute the sse assembly for the old mmx mixed assembly/C. The
-        ; following is kept as reference
-        ;    movq            rcx,        mm0
-        ;    bsr             rax,        rcx
-        ;
-        ;    mov             eob,        rax
-        ;    mov             eee,        rcx
-        ;
-        ;if(eee==0)
-        ;{
-        ;    eob=-1;
-        ;}
-        ;else if(eee<0)
-        ;{
-        ;    eob=15;
-        ;}
-        ;d->eob = eob+1;
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
--- a/vp9/encoder/x86/vp9_quantize_sse2.asm
+++ /dev/null
@@ -1,380 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp9_asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse2 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse2) PRIVATE
-sym(vp9_regular_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SAVE_XMM 7
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-
-    ALIGN_STACK 16, rax
-    %define zrun_zbin_boost   0  ;  8
-    %define abs_minus_zbin    8  ; 32
-    %define temp_qcoeff       40 ; 32
-    %define qcoeff            72 ; 32
-    %define stack_size        104
-    sub         rsp, stack_size
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rdx, [rdi + vp9_block_coeff] ; coeff_ptr
-    mov         rcx, [rdi + vp9_block_zbin] ; zbin_ptr
-    movd        xmm7, [rdi + vp9_block_zbin_extra] ; zbin_oq_value
-
-    ; z
-    movdqa      xmm0, [rdx]
-    movdqa      xmm4, [rdx + 16]
-    mov         rdx, [rdi + vp9_block_round] ; round_ptr
-
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; (z ^ sz)
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-
-    ; x = abs(z)
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-    mov         rcx, [rdi + vp9_block_quant] ; quant_ptr
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm2, xmm7
-    paddw       xmm3, xmm7
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm1, xmm2
-    psubw       xmm5, xmm3
-    movdqa      [rsp + abs_minus_zbin], xmm1
-    movdqa      [rsp + abs_minus_zbin + 16], xmm5
-
-    ; add (zbin_ptr + zbin_oq_value) back
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    movdqa      xmm2, [rdx]
-    movdqa      xmm6, [rdx + 16]
-
-    movdqa      xmm3, [rcx]
-    movdqa      xmm7, [rcx + 16]
-
-    ; x + round
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm6
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm3, xmm1
-    pmulhw      xmm7, xmm5
-
-    ; y += x
-    paddw       xmm1, xmm3
-    paddw       xmm5, xmm7
-
-    movdqa      [rsp + temp_qcoeff], xmm1
-    movdqa      [rsp + temp_qcoeff + 16], xmm5
-
-    pxor        xmm6, xmm6
-    ; zero qcoeff
-    movdqa      [rsp + qcoeff], xmm6
-    movdqa      [rsp + qcoeff + 16], xmm6
-
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost] ; zbin_boost_ptr
-    mov         rax, [rdi + vp9_block_quant_shift] ; quant_shift_ptr
-    mov         [rsp + zrun_zbin_boost], rdx
-
-%macro ZIGZAG_LOOP 1
-    ; x
-    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1           ; x < zbin
-
-    movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
-
-    ; downshift by quant_shift[rc]
-    movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1           ; !y
-    mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-    mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0
-ZIGZAG_LOOP  1
-ZIGZAG_LOOP  4
-ZIGZAG_LOOP  8
-ZIGZAG_LOOP  5
-ZIGZAG_LOOP  2
-ZIGZAG_LOOP  3
-ZIGZAG_LOOP  6
-ZIGZAG_LOOP  9
-ZIGZAG_LOOP 12
-ZIGZAG_LOOP 13
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP  7
-ZIGZAG_LOOP 11
-ZIGZAG_LOOP 14
-ZIGZAG_LOOP 15
-
-    movdqa      xmm2, [rsp + qcoeff]
-    movdqa      xmm3, [rsp + qcoeff + 16]
-
-    mov         rcx, [rsi + vp9_blockd_dequant] ; dequant_ptr
-    mov         rdi, [rsi + vp9_blockd_dqcoeff] ; dqcoeff_ptr
-
-    ; y ^ sz
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm4
-    ; x = (y ^ sz) - sz
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm4
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff] ; qcoeff_ptr
-
-    pmullw      xmm0, xmm2
-    pmullw      xmm1, xmm3
-
-    movdqa      [rcx], xmm2        ; store qcoeff
-    movdqa      [rcx + 16], xmm3
-    movdqa      [rdi], xmm0        ; store dqcoeff
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pcmpeqw     xmm2, xmm6
-    pcmpeqw     xmm3, xmm6
-    ; !
-    pcmpeqw     xmm6, xmm6
-    pxor        xmm2, xmm6
-    pxor        xmm3, xmm6
-    ; mask inv_zig_zag
-    pand        xmm2, [GLOBAL(inv_zig_zag)]
-    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
-    ; select the max value
-    pmaxsw      xmm2, xmm3
-    pshufd      xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00001110b
-    pmaxsw      xmm2, xmm3
-    pshuflw     xmm3, xmm2, 00000001b
-    pmaxsw      xmm2, xmm3
-    movd        eax, xmm2
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-    add         rsp, stack_size
-    pop         rsp
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-    RESTORE_GOT
-    RESTORE_XMM
-    pop         rbp
-    ret
-
-; void vp9_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp9_fast_quantize_b_sse2) PRIVATE
-sym(vp9_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-inv_zig_zag:
-  dw 0x0001, 0x0002, 0x0006, 0x0007
-  dw 0x0003, 0x0005, 0x0008, 0x000d
-  dw 0x0004, 0x0009, 0x000c, 0x000e
-  dw 0x000a, 0x000b, 0x000f, 0x0010
--- a/vp9/encoder/x86/vp9_quantize_sse4.asm
+++ /dev/null
@@ -1,254 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp9_asm_enc_offsets.asm"
-
-
-; void vp9_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp9_regular_quantize_b_sse4) PRIVATE
-sym(vp9_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_zbin]
-    mov         rdx, [rdi + vp9_block_round]
-    movd        xmm7, [rdi + vp9_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp9_block_quant_shift]
-    mov         rcx, [rdi + vp9_block_quant]
-    mov         rdx, [rdi + vp9_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp9_default_zig_zag1d order: see vp9/common/vp9_entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp9_blockd_dequant]
-    mov         rdi, [rsi + vp9_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp9_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp9/common/vp9_entropy.c: vp9_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ /dev/null
@@ -1,138 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp9_asm_enc_offsets.asm"
-
-
-; void vp9_fast_quantize_b_ssse3 | arg
-;  (BLOCK  *b,                   |  0
-;   BLOCKD *d)                   |  1
-;
-
-global sym(vp9_fast_quantize_b_ssse3) PRIVATE
-sym(vp9_fast_quantize_b_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %if LIBVPX_YASM_WIN64
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp9_block_coeff]
-    mov         rcx, [rdi + vp9_block_round]
-    mov         rdx, [rdi + vp9_block_quant_fast]
-
-    ; coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; round
-    movdqa      xmm2, [rcx]
-    movdqa      xmm3, [rcx + 16]
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    pabsw       xmm1, xmm1
-    pabsw       xmm5, xmm5
-
-    paddw       xmm1, xmm2
-    paddw       xmm5, xmm3
-
-    ; quant_fast
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    mov         rax, [rsi + vp9_blockd_qcoeff]
-    mov         rdi, [rsi + vp9_blockd_dequant]
-    mov         rcx, [rsi + vp9_blockd_dqcoeff]
-
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    movdqa      xmm2, [rdi]
-    movdqa      xmm3, [rdi + 16]
-
-    pxor        xmm4, xmm4
-    pmullw      xmm2, xmm1
-    pmullw      xmm3, xmm5
-
-    pcmpeqw     xmm1, xmm4                  ;non zero mask
-    pcmpeqw     xmm5, xmm4                  ;non zero mask
-    packsswb    xmm1, xmm5
-    pshufb      xmm1, [GLOBAL(zz_shuf)]
-
-    pmovmskb    edx, xmm1
-
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax                      ;flip the bits for bsr
-    bsr         eax, edx
-
-    movdqa      [rcx], xmm2                 ;store dqcoeff
-    movdqa      [rcx + 16], xmm3            ;store dqcoeff
-
-    sub         edi, edx                    ;check for all zeros in bit mask
-    sar         edi, 31                     ;0 or -1
-    add         eax, 1
-    and         eax, edi                    ;if the bit mask was all zero,
-                                            ;then eob = 0
-    mov         [rsi + vp9_blockd_eob], eax
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
--- a/vp9/encoder/x86/vp9_quantize_x86.h
+++ /dev/null
@@ -1,48 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license and patent
- *  grant that can be found in the LICENSE file in the root of the source
- *  tree. All contributing project authors may be found in the AUTHORS
- *  file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-#define VP9_ENCODER_X86_VP9_QUANTIZE_X86_H_
-
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-#if HAVE_MMX
-
-#endif /* HAVE_MMX */
-
-
-#if HAVE_SSE2
-extern prototype_quantize_block(vp9_regular_quantize_b_sse2);
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse2
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE2 */
-
-
-#if HAVE_SSE4_1
-extern prototype_quantize_block(vp9_regular_quantize_b_sse4);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-
-#undef vp9_quantize_quantb
-#define vp9_quantize_quantb vp9_regular_quantize_b_sse4
-
-#endif /* !CONFIG_RUNTIME_CPU_DETECT */
-
-#endif /* HAVE_SSE4_1 */
-
-#endif /* QUANTIZE_X86_H */
--- a/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm
@@ -215,7 +215,11 @@
 
 INIT_XMM sse2
 SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
 SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
 SADNXN4D 16, 16
 SADNXN4D 16,  8
 SADNXN4D  8, 16
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -14,11 +14,11 @@
 
 ; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD64XN 1
+cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 64
+  mov              n_rowsd, %1
   pxor                  m0, m0
 .loop:
   movu                  m1, [refq]
@@ -42,14 +42,19 @@
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
 
+INIT_XMM sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+
 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
-INIT_XMM sse2
-cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
+%macro SAD32XN 1
+cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows
   movsxdifnidn src_strideq, src_strided
   movsxdifnidn ref_strideq, ref_strided
-  mov              n_rowsd, 16
+  mov              n_rowsd, %1/2
   pxor                  m0, m0
 
 .loop:
@@ -74,7 +79,13 @@
   paddd                 m0, m1
   movd                 eax, m0
   RET
+%endmacro
 
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+
 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro SAD16XN 1
@@ -112,6 +123,7 @@
 %endmacro
 
 INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
 SAD16XN 16 ; sad16x16_sse2
 SAD16XN  8 ; sad16x8_sse2
 
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -139,8 +139,38 @@
 
 DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
 
-unsigned int vp9_variance4x4_wmt(
+typedef unsigned int (*get_var_sse2) (
   const unsigned char *src_ptr,
+  int source_stride,
+  const unsigned char *ref_ptr,
+  int recon_stride,
+  unsigned int *SSE,
+  int *Sum
+);
+
+static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
+                        const unsigned char *ref_ptr, int  recon_stride,
+                        int  w, int  h, unsigned int *sse, int *sum,
+                        get_var_sse2 var_fn, int block_size) {
+  unsigned int sse0;
+  int sum0;
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      var_fn(src_ptr + source_stride * i + j, source_stride,
+             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+unsigned int vp9_variance4x4_sse2(
+  const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
@@ -148,13 +178,41 @@
   unsigned int var;
   int avg;
 
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
+                  &var, &avg, vp9_get4x4var_mmx, 4);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 4));
+}
 
+unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
+                                  int  source_stride,
+                                  const uint8_t *ref_ptr,
+                                  int  recon_stride,
+                                  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
+                  &var, &avg, vp9_get4x4var_mmx, 4);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
 }
 
-unsigned int vp9_variance8x8_wmt
+unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
+                                  int  source_stride,
+                                  const uint8_t *ref_ptr,
+                                  int  recon_stride,
+                                  unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
+                  &var, &avg, vp9_get4x4var_mmx, 4);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 5));
+}
+
+unsigned int vp9_variance8x8_sse2
 (
   const unsigned char *src_ptr,
   int  source_stride,
@@ -164,14 +222,13 @@
   unsigned int var;
   int avg;
 
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
+                  &var, &avg, vp9_get8x8var_sse2, 8);
   *sse = var;
   return (var - (((unsigned int)avg * avg) >> 6));
-
 }
 
-
-unsigned int vp9_variance16x16_wmt
+unsigned int vp9_variance16x8_sse2
 (
   const unsigned char *src_ptr,
   int  source_stride,
@@ -178,32 +235,32 @@
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
+  unsigned int var;
+  int avg;
 
-
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
+                  &var, &avg, vp9_get8x8var_sse2, 8);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 7));
 }
 
-unsigned int vp9_mse16x16_wmt(
+unsigned int vp9_variance8x16_sse2
+(
   const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
+  unsigned int var;
+  int avg;
 
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  *sse = sse0;
-  return sse0;
-
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
+                &var, &avg, vp9_get8x8var_sse2, 8);
+  *sse = var;
+  return (var - (((unsigned int)avg * avg) >> 7));
 }
 
-
-unsigned int vp9_variance16x8_wmt
+unsigned int vp9_variance16x16_sse2
 (
   const unsigned char *src_ptr,
   int  source_stride,
@@ -210,37 +267,112 @@
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
+  unsigned int var;
+  int avg;
 
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
+                &var, &avg, vp9_get16x16var_sse2, 16);
   *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
-
+  return (var - (((unsigned int)avg * avg) >> 8));
 }
 
-unsigned int vp9_variance8x16_wmt
-(
+unsigned int vp9_mse16x16_wmt(
   const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
 
-  vp9_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
-  vp9_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+  unsigned int sse0;
+  int sum0;
+  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+                       &sum0);
+  *sse = sse0;
+  return sse0;
+}
 
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
+unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
+                &var, &avg, vp9_get16x16var_sse2, 16);
   *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
+  return (var - (((int64_t)avg * avg) >> 10));
+}
 
+unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 9));
+}
+
+unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 12));
+}
+
+unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
+}
+
+unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
+                                    int  source_stride,
+                                    const uint8_t *ref_ptr,
+                                    int  recon_stride,
+                                    unsigned int *sse) {
+  unsigned int var;
+  int avg;
+
+  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
+                &var, &avg, vp9_get16x16var_sse2, 16);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 11));
 }
 
 unsigned int vp9_sub_pixel_variance4x4_wmt
--- a/vp9/encoder/x86/vp9_variance_ssse3.c
+++ b/vp9/encoder/x86/vp9_variance_ssse3.c
@@ -15,15 +15,6 @@
 
 #define HALFNDX 8
 
-extern unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
 extern void vp9_half_horiz_vert_variance16x_h_sse2
 (
   const unsigned char *ref_ptr,
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c
@@ -17,26 +17,12 @@
 
 // TODO(jimbankoski) Consider rewriting the c to take the same values rather
 // than going through these pointer conversions
-#if HAVE_MMX
+#if 0 && HAVE_MMX
 void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
   vp9_short_fdct4x4_mmx(input,   output,    pitch);
   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
 }
 
-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);
-int vp9_mbblock_error_mmx(MACROBLOCK *mb) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr);
-}
-
-int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_mmx(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_mmx_impl(s_ptr, d_ptr);
-}
-
 void vp9_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
@@ -44,27 +30,15 @@
   unsigned char *z = *(be->base_src) + be->src;
   unsigned int  src_stride = be->src_stride;
   short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
+  unsigned char *predictor = *(bd->base_dst) + bd->dst;
+  // TODO(jingning): The prototype function in c has been changed. Need to
+  // modify the mmx and sse versions.
   vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
 }
 
 #endif
 
-#if HAVE_SSE2
-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);
-int vp9_mbblock_error_xmm(MACROBLOCK *mb) {
-  short *coeff_ptr =  mb->block[0].coeff;
-  short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr);
-}
-
-int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp9_mbuverror_xmm(MACROBLOCK *mb) {
-  short *s_ptr = &mb->coeff[256];
-  short *d_ptr = &mb->e_mbd.dqcoeff[256];
-  return vp9_mbuverror_xmm_impl(s_ptr, d_ptr);
-}
-
+#if 0 && HAVE_SSE2
 void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
                               short *diff, unsigned char *predictor,
                               int pitch);
@@ -72,7 +46,9 @@
   unsigned char *z = *(be->base_src) + be->src;
   unsigned int  src_stride = be->src_stride;
   short *diff = &be->src_diff[0];
-  unsigned char *predictor = &bd->predictor[0];
+  unsigned char *predictor = *(bd->base_dst) + bd->dst;
+  // TODO(jingning): The prototype function in c has been changed. Need to
+  // modify the mmx and sse versions.
   vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
 }
 
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -15,7 +15,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_onyx.h
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
-VP9_COMMON_SRCS-yes += common/vp9_blockd.c
 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h
 VP9_COMMON_SRCS-yes += common/vp9_convolve.c
 VP9_COMMON_SRCS-yes += common/vp9_convolve.h
@@ -36,9 +35,9 @@
 VP9_COMMON_SRCS-yes += common/vp9_entropy.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
+VP9_COMMON_SRCS-yes += common/vp9_enums.h
 VP9_COMMON_SRCS-yes += common/vp9_extend.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
-VP9_COMMON_SRCS-yes += common/vp9_header.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_invtrans.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
@@ -56,8 +55,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
-VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
 VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
@@ -72,12 +69,8 @@
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
-VP9_COMMON_SRCS-yes += common/vp9_recon.c
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
-VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.c
-VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c
@@ -92,7 +85,6 @@
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
@@ -113,14 +105,6 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idct_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idct_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -20,7 +20,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-struct vp8_extracfg {
+struct vp9_extracfg {
   struct vpx_codec_pkt_list *pkt_list;
   int                         cpu_used;                    /** available cpu percentage in 1/16*/
   unsigned int                enable_auto_alt_ref;           /** if encoder decides to uses alternate reference frame */
@@ -42,7 +42,7 @@
 
 struct extraconfig_map {
   int                 usage;
-  struct vp8_extracfg cfg;
+  struct vp9_extracfg cfg;
 };
 
 static const struct extraconfig_map extracfg_map[] = {
@@ -73,7 +73,7 @@
 struct vpx_codec_alg_priv {
   vpx_codec_priv_t        base;
   vpx_codec_enc_cfg_t     cfg;
-  struct vp8_extracfg     vp8_cfg;
+  struct vp9_extracfg     vp8_cfg;
   VP9_CONFIG              oxcf;
   VP9_PTR             cpi;
   unsigned char          *cx_data;
@@ -131,7 +131,7 @@
 
 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
                                        const vpx_codec_enc_cfg_t *cfg,
-                                       const struct vp8_extracfg *vp8_cfg) {
+                                       const struct vp9_extracfg *vp8_cfg) {
   RANGE_CHECK(cfg, g_w,                   1, 65535); /* 16 bits available */
   RANGE_CHECK(cfg, g_h,                   1, 65535); /* 16 bits available */
   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
@@ -211,11 +211,12 @@
   switch (img->fmt) {
     case VPX_IMG_FMT_YV12:
     case VPX_IMG_FMT_I420:
-    case VPX_IMG_FMT_VPXI420:
-    case VPX_IMG_FMT_VPXYV12:
+    case VPX_IMG_FMT_I422:
+    case VPX_IMG_FMT_I444:
       break;
     default:
-      ERROR("Invalid image format. Only YV12 and I420 images are supported");
+      ERROR("Invalid image format. Only YV12, I420, I422, I444 images are "
+            "supported.");
   }
 
   if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
@@ -225,9 +226,9 @@
 }
 
 
-static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,
+static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
                                        vpx_codec_enc_cfg_t cfg,
-                                       struct vp8_extracfg vp8_cfg) {
+                                       struct vp9_extracfg vp8_cfg) {
   oxcf->version = cfg.g_profile | (vp8_cfg.experimental ? 0x4 : 0);
   oxcf->width   = cfg.g_w;
   oxcf->height  = cfg.g_h;
@@ -350,7 +351,7 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t       *ctx,
+static vpx_codec_err_t vp9e_set_config(vpx_codec_alg_priv_t       *ctx,
                                        const vpx_codec_enc_cfg_t  *cfg) {
   vpx_codec_err_t res;
 
@@ -369,7 +370,7 @@
 
   if (!res) {
     ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 
@@ -405,7 +406,7 @@
                                  int                   ctrl_id,
                                  va_list               args) {
   vpx_codec_err_t     res  = VPX_CODEC_OK;
-  struct vp8_extracfg xcfg = ctx->vp8_cfg;
+  struct vp9_extracfg xcfg = ctx->vp8_cfg;
 
 #define MAP(id, var) case id: var = CAST(id, args); break;
 
@@ -432,7 +433,7 @@
 
   if (!res) {
     ctx->vp8_cfg = xcfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+    set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
     vp9_change_config(ctx->cpi, &ctx->oxcf);
   }
 
@@ -441,7 +442,7 @@
 }
 
 
-static vpx_codec_err_t vp8e_common_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx,
                                         int              experimental) {
   vpx_codec_err_t            res = VPX_CODEC_OK;
   struct vpx_codec_alg_priv *priv;
@@ -486,7 +487,10 @@
     priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
     priv->vp8_cfg.experimental = experimental;
 
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+    // TODO(agrange) Check the limits set on this buffer, or the check that is
+    // applied in vp9e_encode.
+    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
+//    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
 
     if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
 
@@ -501,7 +505,7 @@
     res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
 
     if (!res) {
-      set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+      set_vp9e_config(&ctx->priv->alg_priv->oxcf,
                       ctx->priv->alg_priv->cfg,
                       ctx->priv->alg_priv->vp8_cfg);
       optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
@@ -517,21 +521,21 @@
 }
 
 
-static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9e_init(vpx_codec_ctx_t *ctx,
                                  vpx_codec_priv_enc_mr_cfg_t *data) {
-  return vp8e_common_init(ctx, 0);
+  return vp9e_common_init(ctx, 0);
 }
 
 
 #if CONFIG_EXPERIMENTAL
-static vpx_codec_err_t vp8e_exp_init(vpx_codec_ctx_t *ctx,
+static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx,
                                      vpx_codec_priv_enc_mr_cfg_t *data) {
-  return vp8e_common_init(ctx, 1);
+  return vp9e_common_init(ctx, 1);
 }
 #endif
 
 
-static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) {
+static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
 
   free(ctx->cx_data);
   vp9_remove_compressor(&ctx->cpi);
@@ -539,28 +543,6 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_crop_width  = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = (1 + yv12->y_width) / 2;
-  yv12->uv_height = (1 + yv12->y_height) / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); // REG_YUV = 0
-  return res;
-}
-
 static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
                                     unsigned long          duration,
                                     unsigned long          deadline) {
@@ -626,7 +608,7 @@
   return index_sz;
 }
 
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
+static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t  *ctx,
                                    const vpx_image_t     *img,
                                    vpx_codec_pts_t        pts,
                                    unsigned long          duration,
@@ -754,7 +736,7 @@
         vpx_codec_cx_pkt_t pkt;
         VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
 
-        /* Pack invisible frames with the next visisble frame */
+        /* Pack invisible frames with the next visible frame */
         if (!cpi->common.show_frame) {
           if (!ctx->pending_cx_data)
             ctx->pending_cx_data = cx_data;
@@ -849,12 +831,12 @@
 }
 
 
-static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
+static const vpx_codec_cx_pkt_t *vp9e_get_cxdata(vpx_codec_alg_priv_t  *ctx,
                                                  vpx_codec_iter_t      *iter) {
   return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
 }
 
-static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
@@ -871,7 +853,7 @@
 
 }
 
-static vpx_codec_err_t vp8e_copy_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
 
@@ -904,7 +886,7 @@
   }
 }
 
-static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
 #if CONFIG_POSTPROC
@@ -925,7 +907,7 @@
 }
 
 
-static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) {
+static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
 
   YV12_BUFFER_CONFIG sd;
   vp9_ppflags_t flags = {0};
@@ -937,45 +919,13 @@
   }
 
   if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
-
-    /*
-    vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
-        sd.y_width + 2*VP9BORDERINPIXELS,
-        sd.y_height + 2*VP9BORDERINPIXELS,
-        1,
-        sd.buffer_alloc);
-    vpx_img_set_rect(&ctx->preview_img,
-        VP9BORDERINPIXELS, VP9BORDERINPIXELS,
-        sd.y_width, sd.y_height);
-        */
-
-    ctx->preview_img.bps = 12;
-    ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
-    ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
-    ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
-
-    if (sd.clrtype == REG_YUV)
-      ctx->preview_img.fmt = VPX_IMG_FMT_I420;
-    else
-      ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
-
-    ctx->preview_img.x_chroma_shift = 1;
-    ctx->preview_img.y_chroma_shift = 1;
-
-    ctx->preview_img.d_w = sd.y_width;
-    ctx->preview_img.d_h = sd.y_height;
-    ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
-    ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
-    ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
-    ctx->preview_img.w   = sd.y_width;
-    ctx->preview_img.h   = sd.y_height;
-
+    yuvconfig2image(&ctx->preview_img, &sd, NULL);
     return &ctx->preview_img;
   } else
     return NULL;
 }
 
-static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
                                            int ctr_id,
                                            va_list args) {
   int update = va_arg(args, int);
@@ -984,7 +934,7 @@
 
 }
 
-static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
                                              int ctr_id,
                                              va_list args) {
   int update = va_arg(args, int);
@@ -992,7 +942,7 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
   int reference_flag = va_arg(args, int);
@@ -1000,7 +950,7 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         int ctr_id,
                                         va_list args) {
   vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
@@ -1018,7 +968,7 @@
 }
 
 
-static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
   vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
@@ -1035,7 +985,7 @@
     return VPX_CODEC_INVALID_PARAM;
 }
 
-static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
                                           int ctr_id,
                                           va_list args) {
 
@@ -1056,16 +1006,16 @@
 }
 
 
-static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {
-  {VP8_SET_REFERENCE,                 vp8e_set_reference},
-  {VP8_COPY_REFERENCE,                vp8e_copy_reference},
-  {VP8_SET_POSTPROC,                  vp8e_set_previewpp},
-  {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},
-  {VP8E_UPD_REFERENCE,                vp8e_update_reference},
-  {VP8E_USE_REFERENCE,                vp8e_use_reference},
-  {VP8E_SET_ROI_MAP,                  vp8e_set_roi_map},
-  {VP8E_SET_ACTIVEMAP,                vp8e_set_activemap},
-  {VP8E_SET_SCALEMODE,                vp8e_set_scalemode},
+static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
+  {VP8_SET_REFERENCE,                 vp9e_set_reference},
+  {VP8_COPY_REFERENCE,                vp9e_copy_reference},
+  {VP8_SET_POSTPROC,                  vp9e_set_previewpp},
+  {VP8E_UPD_ENTROPY,                  vp9e_update_entropy},
+  {VP8E_UPD_REFERENCE,                vp9e_update_reference},
+  {VP8E_USE_REFERENCE,                vp9e_use_reference},
+  {VP8E_SET_ROI_MAP,                  vp9e_set_roi_map},
+  {VP8E_SET_ACTIVEMAP,                vp9e_set_activemap},
+  {VP8E_SET_SCALEMODE,                vp9e_set_scalemode},
   {VP8E_SET_CPUUSED,                  set_param},
   {VP8E_SET_NOISE_SENSITIVITY,        set_param},
   {VP8E_SET_ENABLEAUTOALTREF,         set_param},
@@ -1086,7 +1036,7 @@
   { -1, NULL},
 };
 
-static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {
+static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
   {
     0,
     {
@@ -1151,9 +1101,9 @@
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
   VPX_CODEC_CAP_OUTPUT_PARTITION,
   /* vpx_codec_caps_t          caps; */
-  vp8e_init,          /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp9e_init,          /* vpx_codec_init_fn_t       init; */
+  vp9e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
   {
@@ -1163,12 +1113,12 @@
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
   {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
-    vp8e_set_config,
+    vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
+    vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
+    vp9e_set_config,
     NOT_IMPLEMENTED,
-    vp8e_get_preview,
+    vp9e_get_preview,
   } /* encoder functions */
 };
 
@@ -1180,9 +1130,9 @@
   VPX_CODEC_INTERNAL_ABI_VERSION,
   VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,
   /* vpx_codec_caps_t          caps; */
-  vp8e_exp_init,      /* vpx_codec_init_fn_t       init; */
-  vp8e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
-  vp8e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
+  vp9e_exp_init,      /* vpx_codec_init_fn_t       init; */
+  vp9e_destroy,       /* vpx_codec_destroy_fn_t    destroy; */
+  vp9e_ctf_maps,      /* vpx_codec_ctrl_fn_map_t  *ctrl_maps; */
   NOT_IMPLEMENTED,    /* vpx_codec_get_mmap_fn_t   get_mmap; */
   NOT_IMPLEMENTED,    /* vpx_codec_set_mmap_fn_t   set_mmap; */
   {
@@ -1192,12 +1142,12 @@
     NOT_IMPLEMENTED,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
   {
-    vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
-    vp8e_encode,        /* vpx_codec_encode_fn_t      encode; */
-    vp8e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
-    vp8e_set_config,
+    vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t    peek_si; */
+    vp9e_encode,        /* vpx_codec_encode_fn_t      encode; */
+    vp9e_get_cxdata,    /* vpx_codec_get_cx_data_fn_t   frame_get; */
+    vp9e_set_config,
     NOT_IMPLEMENTED,
-    vp8e_get_preview,
+    vp9e_get_preview,
   } /* encoder functions */
 };
 #endif
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -215,26 +215,19 @@
   if (data + data_sz <= data)
     res = VPX_CODEC_INVALID_PARAM;
   else {
-    /* Parse uncompresssed part of key frame header.
-     * 3 bytes:- including version, frame type and an offset
-     * 3 bytes:- sync code (0x9d, 0x01, 0x2a)
-     * 4 bytes:- including image width and height in the lowest 14 bits
-     *           of each 2-byte value.
-     */
     si->is_kf = 0;
 
-    if (data_sz >= 10 && !(data[0] & 0x01)) { /* I-Frame */
-      const uint8_t *c = data + 3;
+    if (data_sz >= 8 && (data[0] & 0xD8) == 0x80) { /* I-Frame */
+      const uint8_t *c = data + 1;
       si->is_kf = 1;
 
-      /* vet via sync code */
-      if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+      if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2)
         res = VPX_CODEC_UNSUP_BITSTREAM;
 
-      si->w = (c[3] | (c[4] << 8));
-      si->h = (c[5] | (c[6] << 8));
+      si->w = (c[3] << 8) | c[4];
+      si->h = (c[5] << 8) | c[6];
 
-      /*printf("w=%d, h=%d\n", si->w, si->h);*/
+      // printf("w=%d, h=%d\n", si->w, si->h);
       if (!(si->h | si->w))
         res = VPX_CODEC_UNSUP_BITSTREAM;
     } else
@@ -242,7 +235,6 @@
   }
 
   return res;
-
 }
 
 static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t    *ctx,
@@ -329,9 +321,9 @@
 
       vp9_initialize_dec();
 
-      oxcf.Width = ctx->si.w;
-      oxcf.Height = ctx->si.h;
-      oxcf.Version = 9;
+      oxcf.width = ctx->si.w;
+      oxcf.height = ctx->si.h;
+      oxcf.version = 9;
       oxcf.postprocess = 0;
       oxcf.max_threads = ctx->cfg.threads;
       oxcf.inv_tile_order = ctx->invert_tile_order;
@@ -574,30 +566,6 @@
     vp8_finalize_mmaps(ctx->priv->alg_priv);
     res = ctx->iface->init(ctx, NULL);
   }
-
-  return res;
-}
-
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_crop_width  = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = yv12->y_width / 2;
-  yv12->uv_height = yv12->y_height / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
-                   img->fmt == VPX_IMG_FMT_VPXYV12);
 
   return res;
 }
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -10,30 +10,39 @@
 #ifndef VP9_VP9_IFACE_COMMON_H_
 #define VP9_VP9_IFACE_COMMON_H_
 
-static void yuvconfig2image(vpx_image_t               *img,
-                            const YV12_BUFFER_CONFIG  *yv12,
-                            void                      *user_priv) {
+static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG  *yv12,
+                            void *user_priv) {
   /** vpx_img_wrap() doesn't allow specifying independent strides for
     * the Y, U, and V planes, nor other alignment adjustments that
     * might be representable by a YV12_BUFFER_CONFIG, so we just
     * initialize all the fields.*/
-  img->fmt = yv12->clrtype == REG_YUV ?
-             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+  int bps = 12;
+  if (yv12->uv_height == yv12->y_height) {
+    if (yv12->uv_width == yv12->y_width) {
+      img->fmt = VPX_IMG_FMT_I444;
+      bps = 24;
+    } else {
+      img->fmt = VPX_IMG_FMT_I422;
+      bps = 16;
+    }
+  } else {
+    img->fmt = VPX_IMG_FMT_I420;
+  }
   img->w = yv12->y_stride;
-  img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15;
-  img->d_w = yv12->y_width;
-  img->d_h = yv12->y_height;
-  img->x_chroma_shift = 1;
-  img->y_chroma_shift = 1;
+  img->h = multiple8(yv12->y_height + 2 * VP9BORDERINPIXELS);
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->x_chroma_shift = yv12->uv_width < yv12->y_width;
+  img->y_chroma_shift = yv12->uv_height < yv12->y_height;
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer;
   img->stride[VPX_PLANE_Y] = yv12->y_stride;
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
-  img->bps = 12;
+  img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride;
+  img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
   img->img_data_owner = 0;
@@ -40,4 +49,41 @@
   img->self_allocd = 0;
 }
 
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+  yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA];
+
+  yv12->y_crop_width  = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+
+  yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
+                                            : yv12->y_width;
+  yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
+                                             : yv12->y_height;
+
+  yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0;
+  yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+  yv12->clrtype = REG_YUV;
+
+#if CONFIG_ALPHA
+  // For development purposes, force alpha to hold the same data as Y for now.
+  yv12->alpha_buffer = yv12->y_buffer;
+  yv12->alpha_width = yv12->y_width;
+  yv12->alpha_height = yv12->y_height;
+  yv12->alpha_stride = yv12->y_stride;
 #endif
+  return VPX_CODEC_OK;
+}
+
+#endif  // VP9_VP9_IFACE_COMMON_H_
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -17,16 +17,6 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
-# encoder
-#INCLUDES += algo/vpx_common/vpx_mem/include
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += common
-#INCLUDES += algo/vpx_ref/cpu_id/include
-#INCLUDES += common
-#INCLUDES += encoder
-
-VP9_CX_SRCS-yes += encoder/vp9_asm_enc_offsets.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
 VP9_CX_SRCS-yes += encoder/vp9_dct.c
@@ -38,6 +28,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.h
+VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeintra.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
@@ -82,7 +73,6 @@
 
 
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_x86.h
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
@@ -95,7 +85,6 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
-#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
@@ -102,21 +91,10 @@
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
-#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm
 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm
 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/encoder/x86/vp9_dct_sse2.c.d: CFLAGS += -msse2
-vp9/encoder/x86/vp9_dct_sse2.c.o: CFLAGS += -msse2
-endif
 
-
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
-
-$(eval $(call asm_offsets_template,\
-         vp9_asm_enc_offsets.asm, $(VP9_PREFIX)encoder/vp9_asm_enc_offsets.c))
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -22,11 +22,10 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
 VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
 VP9_DX_SRCS-yes += decoder/vp9_decodframe.h
-VP9_DX_SRCS-yes += decoder/vp9_dequantize.c
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.h
+VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
-VP9_DX_SRCS-yes += decoder/vp9_dequantize.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h
@@ -33,16 +32,11 @@
 VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
 VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c
+VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
 
-VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
-
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/decoder/x86/vp9_dequantize_sse2.c.o: CFLAGS += -msse2
-vp9/decoder/x86/vp9_dequantize_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -215,9 +215,13 @@
   unsigned char *roi_map;      /**< specify an id between 0 and 3 for each 16x16 region within a frame */
   unsigned int   rows;         /**< number of rows */
   unsigned int   cols;         /**< number of cols */
-  int     delta_q[4];          /**< quantizer delta [-63, 63] off baseline for regions with id between 0 and 3*/
-  int     delta_lf[4];         /**< loop filter strength delta [-63, 63] for regions with id between 0 and 3 */
-  unsigned int   static_threshold[4];/**< threshold for region to be treated as static */
+  // TODO(paulwilkins): broken for VP9 which has 8 segments
+  // q and loop filter deltas for each segment
+  // (see MAX_MB_SEGMENTS)
+  int     delta_q[4];
+  int     delta_lf[4];
+  // Static breakout threshold for each segment
+  unsigned int   static_threshold[4];
 } vpx_roi_map_t;
 
 /*!\brief  vpx active region map
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -55,9 +55,11 @@
     VPX_IMG_FMT_YV12    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
     VPX_IMG_FMT_I420    = VPX_IMG_FMT_PLANAR | 2,
     VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
-    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4   /** < planar 4:2:0 format with vpx color space */
-  }
-                        vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
+    VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
+    VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7
+  } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
 #define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -76,6 +76,10 @@
     ybf->uv_height = uv_height;
     ybf->uv_stride = uv_stride;
 
+    ybf->alpha_width = 0;
+    ybf->alpha_height = 0;
+    ybf->alpha_stride = 0;
+
     ybf->border = border;
     ybf->frame_size = frame_size;
 
@@ -82,6 +86,7 @@
     ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
     ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * uv_stride) + border / 2;
     ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * uv_stride) + border / 2;
+    ybf->alpha_buffer = NULL;
 
     ybf->corrupted = 0; /* assume not currupted by errors */
     return 0;
@@ -97,3 +102,107 @@
   }
   return -2;
 }
+
+#if CONFIG_VP9
+// TODO(jkoleszar): Maybe replace this with struct vpx_image
+
+int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    vpx_free(ybf->buffer_alloc);
+
+    /* buffer_alloc isn't accessed by most functions.  Rather y_buffer,
+      u_buffer and v_buffer point to buffer_alloc and are used.  Clear out
+      all of this so that a freed pointer isn't inadvertently used */
+    vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+  } else {
+    return -1;
+  }
+
+  return 0;
+}
+
+int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height,
+                             int ss_x, int ss_y, int border) {
+  if (ybf) {
+    const int aligned_width = (width + 7) & ~7;
+    const int aligned_height = (height + 7) & ~7;
+    const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+    const int yplane_size = (aligned_height + 2 * border) * y_stride;
+    const int uv_width = aligned_width >> ss_x;
+    const int uv_height = aligned_height >> ss_y;
+    const int uv_stride = y_stride >> ss_x;
+    const int uv_border_w = border >> ss_x;
+    const int uv_border_h = border >> ss_y;
+    const int uvplane_size = (uv_height + 2 * uv_border_h) * uv_stride;
+#if CONFIG_ALPHA
+    const int alpha_width = aligned_width;
+    const int alpha_height = aligned_height;
+    const int alpha_stride = y_stride;
+    const int alpha_border_w = border;
+    const int alpha_border_h = border;
+    const int alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
+                                 alpha_stride;
+    const int frame_size = yplane_size + 2 * uvplane_size +
+                           alpha_plane_size;
+#else
+    const int frame_size = yplane_size + 2 * uvplane_size;
+#endif
+    if (!ybf->buffer_alloc) {
+      ybf->buffer_alloc = vpx_memalign(32, frame_size);
+      ybf->buffer_alloc_sz = frame_size;
+    }
+
+    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
+      return -1;
+
+    /* Only support allocating buffers that have a border that's a multiple
+     * of 32. The border restriction is required to get 16-byte alignment of
+     * the start of the chroma rows without introducing an arbitrary gap
+     * between planes, which would break the semantics of things like
+     * vpx_img_set_rect(). */
+    if (border & 0x1f)
+      return -3;
+
+    ybf->y_crop_width = width;
+    ybf->y_crop_height = height;
+    ybf->y_width  = aligned_width;
+    ybf->y_height = aligned_height;
+    ybf->y_stride = y_stride;
+
+    ybf->uv_width = uv_width;
+    ybf->uv_height = uv_height;
+    ybf->uv_stride = uv_stride;
+
+    ybf->border = border;
+    ybf->frame_size = frame_size;
+
+    ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+    ybf->u_buffer = ybf->buffer_alloc + yplane_size +
+                    (uv_border_h * uv_stride) + uv_border_w;
+    ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
+                    (uv_border_h * uv_stride) + uv_border_w;
+
+#if CONFIG_ALPHA
+    ybf->alpha_width = alpha_width;
+    ybf->alpha_height = alpha_height;
+    ybf->alpha_stride = alpha_stride;
+    ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size +
+                        (alpha_border_h * alpha_stride) + alpha_border_w;
+#endif
+    ybf->corrupted = 0; /* assume not corrupted by errors */
+    return 0;
+  }
+  return -2;
+}
+
+int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                           int width, int height,
+                           int ss_x, int ss_y, int border) {
+  if (ybf) {
+    vp9_free_frame_buffer(ybf);
+    return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border);
+  }
+  return -2;
+}
+#endif
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -9,6 +9,7 @@
  */
 
 #include <assert.h>
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpx_scale.h"
@@ -94,6 +95,36 @@
                (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
 }
 
+#if CONFIG_VP9
+void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                int subsampling_x, int subsampling_y) {
+  const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x;
+  const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y;
+  const int c_et = ybf->border >> subsampling_y;
+  const int c_el = ybf->border >> subsampling_x;
+  const int c_eb = (ybf->border + ybf->y_height - ybf->y_crop_height +
+                    subsampling_y) >> subsampling_y;
+  const int c_er = (ybf->border + ybf->y_width - ybf->y_crop_width +
+                    subsampling_x) >> subsampling_x;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ybf->border, ybf->border,
+               ybf->border + ybf->y_height - ybf->y_crop_height,
+               ybf->border + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride,
+               c_w, c_h, c_et, c_el, c_eb, c_er);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride,
+               c_w, c_h, c_et, c_el, c_eb, c_er);
+}
+#endif
 
 /****************************************************************************
  *
--- a/vpx_scale/vpx_scale_rtcd.sh
+++ b/vpx_scale/vpx_scale_rtcd.sh
@@ -24,3 +24,8 @@
 
 prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
 specialize vp8_yv12_copy_y neon
+
+if [ "$CONFIG_VP9" = "yes" ]; then
+    prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
+    specialize vp9_extend_frame_borders
+fi
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -18,7 +18,7 @@
 #include "vpx/vpx_integer.h"
 
 #define VP8BORDERINPIXELS       32
-#define VP9BORDERINPIXELS       64
+#define VP9BORDERINPIXELS       96
 #define VP9_INTERP_EXTEND        4
 
   /*************************************
@@ -52,9 +52,14 @@
     int   uv_stride;
     /*    int   uvinternal_width; */
 
+    int   alpha_width;
+    int   alpha_height;
+    int   alpha_stride;
+
     uint8_t *y_buffer;
     uint8_t *u_buffer;
     uint8_t *v_buffer;
+    uint8_t *alpha_buffer;
 
     uint8_t *buffer_alloc;
     int buffer_alloc_sz;
@@ -71,6 +76,14 @@
   int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
                                     int width, int height, int border);
   int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+  int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                             int width, int height, int ss_x, int ss_y,
+                             int border);
+  int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                               int width, int height, int ss_x, int ss_y,
+                               int border);
+  int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
 }
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -12,6 +12,7 @@
 /* This is a simple program that reads ivf files and decodes them
  * using the new interface. Decoded frames are output as YV12 raw.
  */
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -891,6 +892,7 @@
 
   if (use_y4m && !noblit) {
     char buffer[128];
+
     if (!single_file) {
       fprintf(stderr, "YUV4MPEG2 not supported with output patterns,"
               " try --i420 or --yv12.\n");
@@ -908,8 +910,8 @@
     /*Note: We can't output an aspect ratio here because IVF doesn't
        store one, and neither does VP8.
       That will have to wait until these tools support WebM natively.*/
-    sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n",
-            "420jpeg", width, height, fps_num, fps_den, 'p');
+    snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ",
+             width, height, fps_num, fps_den, 'p');
     out_put(out, (unsigned char *)buffer,
             (unsigned int)strlen(buffer), do_md5);
   }
@@ -1036,6 +1038,17 @@
       show_progress(frame_in, frame_out, dx_time);
 
     if (!noblit) {
+      if (frame_out == 1 && img && use_y4m) {
+        /* Write out the color format to terminate the header line */
+        const char *color =
+            img->fmt == VPX_IMG_FMT_444A ? "C444alpha\n" :
+            img->fmt == VPX_IMG_FMT_I444 ? "C444\n" :
+            img->fmt == VPX_IMG_FMT_I422 ? "C422\n" :
+            "C420jpeg\n";
+
+        out_put(out, (const unsigned char*)color, strlen(color), do_md5);
+      }
+
       if (do_scale) {
         if (img && frame_out == 1) {
           stream_w = img->d_w;
@@ -1044,6 +1057,7 @@
                                      stream_w, stream_h, 16);
         }
         if (img && (img->d_w != stream_w || img->d_h != stream_h)) {
+          assert(img->fmt == VPX_IMG_FMT_I420);
           I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y],
                     img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U],
                     img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V],
@@ -1064,6 +1078,12 @@
         unsigned int y;
         char out_fn[PATH_MAX];
         uint8_t *buf;
+        unsigned int c_w =
+            img->x_chroma_shift ? (1 + img->d_w) >> img->x_chroma_shift
+                                : img->d_w;
+        unsigned int c_h =
+            img->y_chroma_shift ? (1 + img->d_h) >> img->y_chroma_shift
+                                : img->d_h;
 
         if (!single_file) {
           size_t len = sizeof(out_fn) - 1;
@@ -1084,15 +1104,15 @@
 
         buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U];
 
-        for (y = 0; y < (1 + img->d_h) / 2; y++) {
-          out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+        for (y = 0; y < c_h; y++) {
+          out_put(out, buf, c_w, do_md5);
           buf += img->stride[VPX_PLANE_U];
         }
 
         buf = img->planes[flipuv ? VPX_PLANE_U : VPX_PLANE_V];
 
-        for (y = 0; y < (1 + img->d_h) / 2; y++) {
-          out_put(out, buf, (1 + img->d_w) / 2, do_md5);
+        for (y = 0; y < c_h; y++) {
+          out_put(out, buf, c_w, do_md5);
           buf += img->stride[VPX_PLANE_V];
         }
 
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -326,6 +326,7 @@
   unsigned int          h;
   struct vpx_rational   framerate;
   int                   use_i420;
+  int                   only_i420;
 };
 
 
@@ -1481,9 +1482,12 @@
 
 #define mmin(a, b)  ((a) < (b) ? (a) : (b))
 static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2,
-                          int yloc[2], int uloc[2], int vloc[2]) {
+                          int yloc[4], int uloc[4], int vloc[4]) {
   const unsigned int bsize = 64;
-  const unsigned int bsize2 = bsize >> 1;
+  const unsigned int bsizey = bsize >> img1->y_chroma_shift;
+  const unsigned int bsizex = bsize >> img1->x_chroma_shift;
+  const int c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const int c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
   unsigned int match = 1;
   unsigned int i, j;
   yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
@@ -1510,12 +1514,13 @@
         }
     }
   }
+
   uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {
-    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
       int k, l;
-      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;
-      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;
+      int si = mmin(i + bsizey, c_h - i);
+      int sj = mmin(j + bsizex, c_w - j);
       for (k = 0; match && k < si; k++)
         for (l = 0; match && l < sj; l++) {
           if (*(img1->planes[VPX_PLANE_U] +
@@ -1535,11 +1540,11 @@
     }
   }
   vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {
-    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
       int k, l;
-      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;
-      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;
+      int si = mmin(i + bsizey, c_h - i);
+      int sj = mmin(j + bsizex, c_w - j);
       for (k = 0; match && k < si; k++)
         for (l = 0; match && l < sj; l++) {
           if (*(img1->planes[VPX_PLANE_V] +
@@ -1562,6 +1567,8 @@
 
 static int compare_img(vpx_image_t *img1, vpx_image_t *img2)
 {
+  const int c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const int c_h = (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
   int match = 1;
   unsigned int i;
 
@@ -1574,15 +1581,15 @@
                      img2->planes[VPX_PLANE_Y]+i*img2->stride[VPX_PLANE_Y],
                      img1->d_w) == 0);
 
-  for (i = 0; i < img1->d_h/2; i++)
+  for (i = 0; i < c_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_U]+i*img1->stride[VPX_PLANE_U],
                      img2->planes[VPX_PLANE_U]+i*img2->stride[VPX_PLANE_U],
-                     (img1->d_w + 1) / 2) == 0);
+                     c_w) == 0);
 
-  for (i = 0; i < img1->d_h/2; i++)
+  for (i = 0; i < c_h; i++)
     match &= (memcmp(img1->planes[VPX_PLANE_V]+i*img1->stride[VPX_PLANE_U],
                      img2->planes[VPX_PLANE_V]+i*img2->stride[VPX_PLANE_U],
-                     (img1->d_w + 1) / 2) == 0);
+                     c_w) == 0);
 
   return match;
 }
@@ -1792,7 +1799,8 @@
 
   if (input->detect.buf_read == 4
       && file_is_y4m(input->file, &input->y4m, input->detect.buf)) {
-    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4) >= 0) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
       input->file_type = FILE_TYPE_Y4M;
       input->w = input->y4m.pic_w;
       input->h = input->y4m.pic_h;
@@ -2516,6 +2524,7 @@
   input.framerate.num = 30;
   input.framerate.den = 1;
   input.use_i420 = 1;
+  input.only_i420 = 1;
 
   /* First parse the global configuration values, because we want to apply
    * other parameters on top of the default configuration provided by the
@@ -2549,6 +2558,12 @@
 
   if (!input.fn)
     usage_exit();
+
+#if CONFIG_NON420
+  /* Decide if other chroma subsamplings than 4:2:0 are supported */
+  if (global.codec->fourcc == VP9_FOURCC)
+    input.only_i420 = 0;
+#endif
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
--- a/y4minput.c
+++ b/y4minput.c
@@ -659,7 +659,8 @@
                              unsigned char *_aux) {
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip) {
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420) {
   char buffer[80];
   int  ret;
   int  i;
@@ -701,6 +702,8 @@
             "Only progressive scan handled.\n");
     return -1;
   }
+  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
+  _y4m->vpx_bps = 12;
   if (strcmp(_y4m->chroma_type, "420") == 0 ||
       strcmp(_y4m->chroma_type, "420jpeg") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
@@ -734,16 +737,30 @@
     _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
     _y4m->convert = y4m_convert_422jpeg_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_422_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
+      _y4m->vpx_bps = 16;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                              + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "411") == 0) {
     _y4m->src_c_dec_h = 4;
     _y4m->dst_c_dec_h = 2;
@@ -758,29 +775,52 @@
     _y4m->convert = y4m_convert_411_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "444") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
+      _y4m->vpx_bps = 24;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.
-      The extra plane also gets read into the aux buf.
-      It will be discarded.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
+      _y4m->vpx_bps = 32;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
     _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
     _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
@@ -847,22 +887,23 @@
      sizes, which would require a separate fread call for every row.*/
   memset(_img, 0, sizeof(*_img));
   /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/
-  _img->fmt = IMG_FMT_I420;
+  _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
-  /*This is hard-coded to 4:2:0 for now, as that's all VP8 supports.*/
-  _img->x_chroma_shift = 1;
-  _img->y_chroma_shift = 1;
-  _img->bps = 12;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->vpx_bps;
+
   /*Set up the buffer pointers.*/
   pic_sz = _y4m->pic_w * _y4m->pic_h;
   c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _y4m->pic_w;
+  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] = _y4m->pic_w;
   _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
   _img->planes[PLANE_Y] = _y4m->dst_buf;
   _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
   _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }
--- a/y4minput.h
+++ b/y4minput.h
@@ -51,9 +51,12 @@
   y4m_convert_func  convert;
   unsigned char    *dst_buf;
   unsigned char    *aux_buf;
+  enum vpx_img_fmt  vpx_fmt;
+  int               vpx_bps;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip);
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);